[llvm] ac65fb8 - [LoopVectorize] Fix incorrect order of invariant stores when there are multiple reductions.

Thu Aug 31 09:22:44 PDT 2023

Author: Igor Kirillov
Date: 2023-08-31T16:21:44Z
New Revision: ac65fb869977185b44757b94dc5130bd08c6f7e2

URL: https://github.com/llvm/llvm-project/commit/ac65fb869977185b44757b94dc5130bd08c6f7e2
DIFF: https://github.com/llvm/llvm-project/commit/ac65fb869977185b44757b94dc5130bd08c6f7e2.diff

LOG: [LoopVectorize] Fix incorrect order of invariant stores when there are multiple reductions.

When a loop has multiple reductions, each with an intermediate invariant
store, the order in which those reductions are processed is not considered.
This can result in the invariant stores outside the loop not preserving the
original order.
This patch sorts VPReductionPHIRecipes by the order in which they have
stores in the original loop before running
`InnerLoopVectorizer::fixReduction` function, and it helps to maintain
the correct order of stores.

Fixes https://github.com/llvm/llvm-project/issues/64047

Differential Revision: https://reviews.llvm.org/D157631

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c11ac8e560867e..f60ea9b9fe2bd5 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3617,10 +3617,44 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
   // the incoming edges.
   VPBasicBlock *Header =
       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
+
+  // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
+  // sank outside of the loop would keep the same order as they had in the
+  // original loop.
+  SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
   for (VPRecipeBase &R : Header->phis()) {
     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
-      fixReduction(ReductionPhi, State);
-    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+      ReductionPHIList.emplace_back(ReductionPhi);
+  }
+  stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
+                                       const VPReductionPHIRecipe *R2) {
+    auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
+    auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
+
+    // If neither of the recipes has an intermediate store, keep the order the
+    // same.
+    if (!IS1 && !IS2)
+      return false;
+
+    // If only one of the recipes has an intermediate store, then move it
+    // towards the beginning of the list.
+    if (IS1 && !IS2)
+      return true;
+
+    if (!IS1 && IS2)
+      return false;
+
+    // If both recipes have an intermediate store, then the recipe with the
+    // later store should be processed earlier. So it should go to the beginning
+    // of the list.
+    return DT->dominates(IS2, IS1);
+  });
+
+  for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
+    fixReduction(ReductionPhi, State);
+
+  for (VPRecipeBase &R : Header->phis()) {
+    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
       fixFixedOrderRecurrence(FOR, State);
   }
 }

diff  --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 96f15659797185..1cbe9d70aec2f0 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -558,14 +558,13 @@ exit:                                             ; preds = %for.body
 }
 
 ; Make sure that if there are several reductions in the loop, the order of invariant stores sank outside of the loop is preserved
-; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch
 ; See https://github.com/llvm/llvm-project/issues/64047
 define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
 ; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
 ; CHECK-NEXT:    store i32 [[TMP2]], ptr %dst, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr %dst, align 4
 ;
 entry:
@@ -619,14 +618,13 @@ exit:
 }
 
 ; Same as above but storing is done to two 
diff erent pointers and they can be aliased
-; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch
 define void @reduc_add_mul_store_
diff erent_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
 ; CHECK-LABEL: define void @reduc_add_mul_store_
diff erent_ptr
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT:    store i32 [[TMP2]], ptr %dst2, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
-; CHECK-NEXT:    store i32 [[TMP4]], ptr %dst1, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT:    store i32 [[TMP2]], ptr %dst1, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    store i32 [[TMP4]], ptr %dst2, align 4
 ;
 entry:
   br label %for.body