[llvm] [LoadStoreVectorizer] Postprocess and merge equivalence classes (PR #114501)

Vyacheslav Klochkov via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 1 09:34:09 PDT 2024


https://github.com/v-klochkov updated https://github.com/llvm/llvm-project/pull/114501

>From 6cbe171bdc5f05e6e85a5fc4540ac93057a576e2 Mon Sep 17 00:00:00 2001
From: "Klochkov, Vyacheslav N" <vyacheslav.n.klochkov at intel.com>
Date: Wed, 30 Oct 2024 19:29:41 -0700
Subject: [PATCH 1/2] [LoadStoreVectorizer] Postprocess and merge equivalence
 classes

This patch introduces a new method:
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const

The method is called at the end of Vectorizer::collectEquivalenceClasses() and is
needed to merge equivalence classes that differ only by their underlying objects
(UB1 and UB2), where UB1 is 1-level-indirection underlying base for UB2.
This situation arises due to the limited lookup depth used during the search of
underlying bases with llvm::getUnderlyingObject(ptr).

Using any fixed lookup depth can result into creation of multiple equivalence
classes that only differ  by 1-level indirection bases.

The new approach merges equivalence classes if they have adjucent bases (1-level indirection).
If a series of equivalence classes form ladder formed of 1-step/level indirections,
they are all merged into a single equivalence class.
This provides more opportunities for the load-store vectorizer to generate better vectors.

Signed-off-by: Klochkov, Vyacheslav N <vyacheslav.n.klochkov at intel.com>
---
 .../Vectorize/LoadStoreVectorizer.cpp         | 128 ++++++++++++++++++
 .../X86/massive_indirection.ll                |  63 +++++++++
 2 files changed, 191 insertions(+)
 create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259cd6..59c7f2239d972a 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -324,6 +324,11 @@ class Vectorizer {
       Instruction *ChainElem, Instruction *ChainBegin,
       const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
 
+  /// Merges the equivalence classes if they have uderlying objects that differ
+  /// by one level of indirection (i.e., one is a getelementptr and the other is
+  /// the base pointer in that getelementptr).
+  void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
+
   /// Collects loads and stores grouped by "equivalence class", where:
   ///   - all elements in an eq class are a load or all are a store,
   ///   - they all load/store the same element size (it's OK to have e.g. i8 and
@@ -1305,6 +1310,128 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
   return std::nullopt;
 }
 
+void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
+  if (EQClasses.size() < 2) // There is nothing to merge.
+    return;
+
+  // The reduced key has all elements of the ECClassKey except the underlying
+  // object. Check that EqClassKey has 4 elements and define the reduced key.
+  static_assert(std::tuple_size_v<EqClassKey> == 4,
+                "EqClassKey has changed - EqClassReducedKey needs changes too");
+  using EqClassReducedKey =
+      std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
+                 std::tuple_element_t<2, EqClassKey> /* Element size */,
+                 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
+  using ECReducedKeyToUnderlyingObjectMap =
+      MapVector<EqClassReducedKey,
+                SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
+
+  // Form a map from the reduced key (without the underlying object) to the
+  // underlying objects: 1 reduced key to many underlying objects, to form
+  // groups of potentially merge-able equivalence classes.
+  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
+  bool FoundPotentiallyOptimizableEC = false;
+  for (const auto &EC : EQClasses) {
+    const auto &Key = EC.first;
+    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
+                             std::get<3>(Key)};
+    RedKeyToUOMap[RedKey].insert(std::get<0>(Key));
+    if (RedKeyToUOMap[RedKey].size() > 1)
+      FoundPotentiallyOptimizableEC = true;
+  }
+  if (!FoundPotentiallyOptimizableEC)
+    return;
+
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
+    for (const auto &EC : EQClasses) {
+      dbgs() << "    Key: ([" << std::get<0>(EC.first)
+             << "]: " << *std::get<0>(EC.first) << ", " << std::get<1>(EC.first)
+             << ", " << std::get<2>(EC.first) << ", "
+             << static_cast<int>(std::get<3>(EC.first)) << ")\n";
+      for (const auto &Inst : EC.second)
+        dbgs() << "\tInst:\t" << *Inst << "\n";
+    }
+  });
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
+    for (const auto &RedKeyToUO : RedKeyToUOMap) {
+      dbgs() << "    Reduced key: (" << std::get<0>(RedKeyToUO.first) << ", "
+             << std::get<1>(RedKeyToUO.first) << ", "
+             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << ") --> "
+             << RedKeyToUO.second.size() << " underlying objects:\n";
+      for (auto UObject : RedKeyToUO.second)
+        dbgs() << "    [" << UObject << "]: " << *UObject << "\n";
+    }
+  });
+
+  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
+
+  // Compute the ultimate targets for a set of underlying objects.
+  auto GetUltimateTargets =
+      [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
+    UObjectToUObjectMap IndirectionMap;
+    for (const auto *UObject : UObjects) {
+      const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
+      const auto *UltimateTarget =
+          llvm::getUnderlyingObject(UObject, MaxLookupDepth);
+      if (UltimateTarget != UObject)
+        IndirectionMap[UObject] = UltimateTarget;
+    }
+    UObjectToUObjectMap UltimateTargetsMap;
+    for (const auto *UObject : UObjects) {
+      auto Target = UObject;
+      auto It = IndirectionMap.find(Target);
+      for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
+        Target = It->second;
+      UltimateTargetsMap[UObject] = Target;
+    }
+    return UltimateTargetsMap;
+  };
+
+  // For each item in RedKeyToUOMap, if it has more than one underlying object,
+  // try to merge the equivalence classes.
+  for (auto &RedKeyToUO : RedKeyToUOMap) {
+    auto UObjects = RedKeyToUO.second;
+    if (UObjects.size() < 2)
+      continue;
+    const auto RedKey = RedKeyToUO.first;
+    auto UTMap = GetUltimateTargets(UObjects);
+    for (const auto &UT : UTMap) {
+      const Value *UObject = UT.first;
+      const Value *UltimateTarget = UT.second;
+      if (UObject == UltimateTarget)
+        continue;
+
+      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
+                         std::get<2>(RedKey)};
+      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
+                       std::get<2>(RedKey)};
+      auto VecFrom = EQClasses[KeyFrom];
+      auto VecTo = EQClasses[KeyTo];
+      SmallVector<Instruction *, 8> MergedVec;
+      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
+                 std::back_inserter(MergedVec),
+                 [](Instruction *A, Instruction *B) {
+                   return A && B && A->comesBefore(B);
+                 });
+      EQClasses[KeyTo] = std::move(MergedVec);
+      EQClasses.erase(KeyFrom);
+    }
+  }
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
+    for (const auto &EC : EQClasses) {
+      dbgs() << "    Key: ([" << std::get<0>(EC.first)
+             << "]: " << *std::get<0>(EC.first) << ", " << std::get<1>(EC.first)
+             << ", " << std::get<2>(EC.first) << ", "
+             << static_cast<int>(std::get<3>(EC.first)) << ")\n";
+      for (const auto &Inst : EC.second)
+        dbgs() << "\tInst:\t" << *Inst << "\n";
+    }
+  });
+}
+
 EquivalenceClassMap
 Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                       BasicBlock::iterator End) {
@@ -1377,6 +1504,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
         .emplace_back(&I);
   }
 
+  mergeEquivalenceClasses(Ret);
   return Ret;
 }
 
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
new file mode 100644
index 00000000000000..ab320f02ed937d
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -0,0 +1,63 @@
+; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o %t.out.ll
+; RUN: FileCheck -input-file=%t.out.ll %s
+
+; This test verifies that the vectorizer can handle an extended sequence of
+; getelementptr instructions and generate longer vectors. With special handling,
+; some elements can still be vectorized even if they require looking up the
+; common underlying object deeper than 6 levels from the original pointer.
+
+; The test below is the simplified version of actual performance oriented
+; workload; the offsets in getelementptr instructins are similar or same for
+; the test simplicity.
+
+define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: @v1_v2_v4_v1_to_v8_levels_6_7_8_8
+; CHECK: store <8 x half>
+
+  %level1 = getelementptr inbounds i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr inbounds i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 8
+  %d8 = getelementptr inbounds i8, ptr %b7, i32 12
+
+  store half 0xH0000, ptr %a6, align 16
+  store <4 x half> zeroinitializer, ptr %b7, align 2
+  store <2 x half> zeroinitializer, ptr %c8, align 2
+  store half 0xH0000, ptr %d8, align 2
+  ret void
+}
+
+define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: @v1x8_levels_6_7_8_9_10_11_12_13
+; CHECK: store <8 x half>
+
+  %level1 = getelementptr inbounds i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr inbounds i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 2
+  %d9 = getelementptr inbounds i8, ptr %c8, i32 2
+  %e10 = getelementptr inbounds i8, ptr %d9, i32 2
+  %f11 = getelementptr inbounds i8, ptr %e10, i32 2
+  %g12 = getelementptr inbounds i8, ptr %f11, i32 2
+  %h13 = getelementptr inbounds i8, ptr %g12, i32 2
+
+  store half 0xH0000, ptr %a6, align 16
+  store half 0xH0000, ptr %b7, align 2
+  store half 0xH0000, ptr %c8, align 2
+  store half 0xH0000, ptr %d9, align 2
+  store half 0xH0000, ptr %e10, align 8
+  store half 0xH0000, ptr %f11, align 2
+  store half 0xH0000, ptr %g12, align 2
+  store half 0xH0000, ptr %h13, align 2
+  ret void
+}

>From 91a74c80e89ff46dcaf15dad30f9667e082d5979 Mon Sep 17 00:00:00 2001
From: "Klochkov, Vyacheslav N" <vyacheslav.n.klochkov at intel.com>
Date: Fri, 1 Nov 2024 09:09:04 -0700
Subject: [PATCH 2/2] [NFC] Address review comments: fix misprint + use C++17
 struct binding

---
 llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp     | 8 ++------
 .../LoadStoreVectorizer/X86/massive_indirection.ll        | 2 +-
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 59c7f2239d972a..699fea8872dc58 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1391,15 +1391,11 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
 
   // For each item in RedKeyToUOMap, if it has more than one underlying object,
   // try to merge the equivalence classes.
-  for (auto &RedKeyToUO : RedKeyToUOMap) {
-    auto UObjects = RedKeyToUO.second;
+  for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
     if (UObjects.size() < 2)
       continue;
-    const auto RedKey = RedKeyToUO.first;
     auto UTMap = GetUltimateTargets(UObjects);
-    for (const auto &UT : UTMap) {
-      const Value *UObject = UT.first;
-      const Value *UltimateTarget = UT.second;
+    for (const auto &[UObject, UltimateTarget] : UTMap) {
       if (UObject == UltimateTarget)
         continue;
 
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
index ab320f02ed937d..b909f354393fab 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -7,7 +7,7 @@
 ; common underlying object deeper than 6 levels from the original pointer.
 
 ; The test below is the simplified version of actual performance oriented
-; workload; the offsets in getelementptr instructins are similar or same for
+; workload; the offsets in getelementptr instructions are similar or same for
 ; the test simplicity.
 
 define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {



More information about the llvm-commits mailing list