[llvm] 9f3c559 - [SLP]Fix loads sorting for loads from diffrent basic blocks

Tue Oct 8 13:44:19 PDT 2024

Author: Alexey Bataev
Date: 2024-10-08T16:44:16-04:00
New Revision: 9f3c55954eaa71910caa0abbb404db02d3a104c1

URL: https://github.com/llvm/llvm-project/commit/9f3c55954eaa71910caa0abbb404db02d3a104c1
DIFF: https://github.com/llvm/llvm-project/commit/9f3c55954eaa71910caa0abbb404db02d3a104c1.diff

LOG: [SLP]Fix loads sorting for loads from diffrent basic blocks

Patch fixes lookup for loads from different basic blocks. Originally,
the code checked is the main key (combined with parent basic block) was
created, but did not include the key into LoadsMap. When the code looked for
the load pointer in LoadsMap, it skipped check for parent basic block
and could mix loads from different basic blocks (but the same underlying
pointer). Currently, it does lead to any issues, since later the code
compares parent basic blocks and sorts loads properly. But it increases
compile time and affects compile time.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/111521

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 318a61208fe393..553df1c08f3aeb 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -18588,14 +18588,14 @@ class HorizontalReduction {
         8>
         PossibleReducedVals;
     initReductionOps(Root);
-    DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
+    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
     SmallSet<size_t, 2> LoadKeyUsed;
 
     auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
       Key = hash_combine(hash_value(LI->getParent()), Key);
       Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
-      if (LoadKeyUsed.contains(Key)) {
-        auto LIt = LoadsMap.find(Ptr);
+      if (!LoadKeyUsed.insert(Key).second) {
+        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
         if (LIt != LoadsMap.end()) {
           for (LoadInst *RLI : LIt->second) {
             if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
@@ -18617,8 +18617,8 @@ class HorizontalReduction {
           }
         }
       }
-      LoadKeyUsed.insert(Key);
-      LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
+      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
+          .first->second.push_back(LI);
       return hash_value(LI->getPointerOperand());
     };
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index d1f93eccc2a91d..a7201e776fb4cd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -980,10 +980,10 @@ define i32 @maxi8_wrong_parent(i32) {
 ; SSE4-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
 ; SSE4-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
 ; SSE4-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
-; SSE4-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP2]]
-; SSE4-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP2]]
-; SSE4-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP5]], i32 [[TMP6]]
+; SSE4-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP5]]
+; SSE4-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP5]]
+; SSE4-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP6]], [[TMP2]]
+; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP6]], i32 [[TMP2]]
 ; SSE4-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
 ; SSE4-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
 ; SSE4-NEXT:    [[OP_RDX6:%.*]] = icmp sgt i32 [[OP_RDX5]], [[TMP3]]
@@ -999,10 +999,10 @@ define i32 @maxi8_wrong_parent(i32) {
 ; AVX-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
 ; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
 ; AVX-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP2]]
-; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP2]]
-; AVX-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP5]], i32 [[TMP6]]
+; AVX-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP5]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP5]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP6]], [[TMP2]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP6]], i32 [[TMP2]]
 ; AVX-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
 ; AVX-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
 ; AVX-NEXT:    [[OP_RDX6:%.*]] = icmp sgt i32 [[OP_RDX5]], [[TMP3]]