[llvm] [LoadStoreVectorizer] Postprocess and merge equivalence classes (PR #114501)
Vyacheslav Klochkov via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 1 09:34:09 PDT 2024
https://github.com/v-klochkov updated https://github.com/llvm/llvm-project/pull/114501
>From 6cbe171bdc5f05e6e85a5fc4540ac93057a576e2 Mon Sep 17 00:00:00 2001
From: "Klochkov, Vyacheslav N" <vyacheslav.n.klochkov at intel.com>
Date: Wed, 30 Oct 2024 19:29:41 -0700
Subject: [PATCH 1/2] [LoadStoreVectorizer] Postprocess and merge equivalence
classes
This patch introduces a new method:
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const
The method is called at the end of Vectorizer::collectEquivalenceClasses() and is
needed to merge equivalence classes that differ only by their underlying objects
(UB1 and UB2), where UB1 is the 1-level-indirection underlying base of UB2.
This situation arises due to the limited lookup depth used during the search of
underlying bases with llvm::getUnderlyingObject(ptr).
Using any fixed lookup depth can result in the creation of multiple equivalence
classes that differ only by 1-level indirection bases.
The new approach merges equivalence classes if they have adjacent bases (1-level indirection).
If a series of equivalence classes forms a ladder of 1-level indirection steps,
they are all merged into a single equivalence class.
This provides more opportunities for the load-store vectorizer to generate better vectors.
Signed-off-by: Klochkov, Vyacheslav N <vyacheslav.n.klochkov at intel.com>
---
.../Vectorize/LoadStoreVectorizer.cpp | 128 ++++++++++++++++++
.../X86/massive_indirection.ll | 63 +++++++++
2 files changed, 191 insertions(+)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259cd6..59c7f2239d972a 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -324,6 +324,11 @@ class Vectorizer {
Instruction *ChainElem, Instruction *ChainBegin,
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+ /// Merges the equivalence classes if they have underlying objects that differ
+ /// by one level of indirection (i.e., one is a getelementptr and the other is
+ /// the base pointer in that getelementptr).
+ void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
+
/// Collects loads and stores grouped by "equivalence class", where:
/// - all elements in an eq class are a load or all are a store,
/// - they all load/store the same element size (it's OK to have e.g. i8 and
@@ -1305,6 +1310,128 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
return std::nullopt;
}
+void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
+ if (EQClasses.size() < 2) // There is nothing to merge.
+ return;
+
+ // The reduced key has all elements of the EqClassKey except the underlying
+ // object. Check that EqClassKey has 4 elements and define the reduced key.
+ static_assert(std::tuple_size_v<EqClassKey> == 4,
+ "EqClassKey has changed - EqClassReducedKey needs changes too");
+ using EqClassReducedKey =
+ std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
+ std::tuple_element_t<2, EqClassKey> /* Element size */,
+ std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
+ using ECReducedKeyToUnderlyingObjectMap =
+ MapVector<EqClassReducedKey,
+ SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
+
+ // Form a map from the reduced key (without the underlying object) to the
+ // underlying objects: 1 reduced key to many underlying objects, to form
+ // groups of potentially merge-able equivalence classes.
+ ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
+ bool FoundPotentiallyOptimizableEC = false;
+ for (const auto &EC : EQClasses) {
+ const auto &Key = EC.first;
+ EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
+ std::get<3>(Key)};
+ RedKeyToUOMap[RedKey].insert(std::get<0>(Key));
+ if (RedKeyToUOMap[RedKey].size() > 1)
+ FoundPotentiallyOptimizableEC = true;
+ }
+ if (!FoundPotentiallyOptimizableEC)
+ return;
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
+ for (const auto &EC : EQClasses) {
+ dbgs() << " Key: ([" << std::get<0>(EC.first)
+ << "]: " << *std::get<0>(EC.first) << ", " << std::get<1>(EC.first)
+ << ", " << std::get<2>(EC.first) << ", "
+ << static_cast<int>(std::get<3>(EC.first)) << ")\n";
+ for (const auto &Inst : EC.second)
+ dbgs() << "\tInst:\t" << *Inst << "\n";
+ }
+ });
+ LLVM_DEBUG({
+ dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
+ for (const auto &RedKeyToUO : RedKeyToUOMap) {
+ dbgs() << " Reduced key: (" << std::get<0>(RedKeyToUO.first) << ", "
+ << std::get<1>(RedKeyToUO.first) << ", "
+ << static_cast<int>(std::get<2>(RedKeyToUO.first)) << ") --> "
+ << RedKeyToUO.second.size() << " underlying objects:\n";
+ for (auto UObject : RedKeyToUO.second)
+ dbgs() << " [" << UObject << "]: " << *UObject << "\n";
+ }
+ });
+
+ using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
+
+ // Compute the ultimate targets for a set of underlying objects.
+ auto GetUltimateTargets =
+ [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
+ UObjectToUObjectMap IndirectionMap;
+ for (const auto *UObject : UObjects) {
+ const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
+ const auto *UltimateTarget =
+ llvm::getUnderlyingObject(UObject, MaxLookupDepth);
+ if (UltimateTarget != UObject)
+ IndirectionMap[UObject] = UltimateTarget;
+ }
+ UObjectToUObjectMap UltimateTargetsMap;
+ for (const auto *UObject : UObjects) {
+ auto Target = UObject;
+ auto It = IndirectionMap.find(Target);
+ for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
+ Target = It->second;
+ UltimateTargetsMap[UObject] = Target;
+ }
+ return UltimateTargetsMap;
+ };
+
+ // For each item in RedKeyToUOMap, if it has more than one underlying object,
+ // try to merge the equivalence classes.
+ for (auto &RedKeyToUO : RedKeyToUOMap) {
+ auto UObjects = RedKeyToUO.second;
+ if (UObjects.size() < 2)
+ continue;
+ const auto RedKey = RedKeyToUO.first;
+ auto UTMap = GetUltimateTargets(UObjects);
+ for (const auto &UT : UTMap) {
+ const Value *UObject = UT.first;
+ const Value *UltimateTarget = UT.second;
+ if (UObject == UltimateTarget)
+ continue;
+
+ EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
+ std::get<2>(RedKey)};
+ EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
+ std::get<2>(RedKey)};
+ auto VecFrom = EQClasses[KeyFrom];
+ auto VecTo = EQClasses[KeyTo];
+ SmallVector<Instruction *, 8> MergedVec;
+ std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
+ std::back_inserter(MergedVec),
+ [](Instruction *A, Instruction *B) {
+ return A && B && A->comesBefore(B);
+ });
+ EQClasses[KeyTo] = std::move(MergedVec);
+ EQClasses.erase(KeyFrom);
+ }
+ }
+ LLVM_DEBUG({
+ dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
+ for (const auto &EC : EQClasses) {
+ dbgs() << " Key: ([" << std::get<0>(EC.first)
+ << "]: " << *std::get<0>(EC.first) << ", " << std::get<1>(EC.first)
+ << ", " << std::get<2>(EC.first) << ", "
+ << static_cast<int>(std::get<3>(EC.first)) << ")\n";
+ for (const auto &Inst : EC.second)
+ dbgs() << "\tInst:\t" << *Inst << "\n";
+ }
+ });
+}
+
EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
BasicBlock::iterator End) {
@@ -1377,6 +1504,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
.emplace_back(&I);
}
+ mergeEquivalenceClasses(Ret);
return Ret;
}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
new file mode 100644
index 00000000000000..ab320f02ed937d
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -0,0 +1,63 @@
+; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o %t.out.ll
+; RUN: FileCheck -input-file=%t.out.ll %s
+
+; This test verifies that the vectorizer can handle an extended sequence of
+; getelementptr instructions and generate longer vectors. With special handling,
+; some elements can still be vectorized even if they require looking up the
+; common underlying object deeper than 6 levels from the original pointer.
+
+; The test below is the simplified version of actual performance oriented
+; workload; the offsets in getelementptr instructins are similar or same for
+; the test simplicity.
+
+define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: @v1_v2_v4_v1_to_v8_levels_6_7_8_8
+; CHECK: store <8 x half>
+
+ %level1 = getelementptr inbounds i8, ptr %arg1, i32 917504
+ %level2 = getelementptr i8, ptr %level1, i32 %arg0
+ %level3 = getelementptr i8, ptr %level2, i32 32768
+ %level4 = getelementptr inbounds i8, ptr %level3, i32 %arg0
+ %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+ %a6 = getelementptr i8, ptr %level5, i32 %arg0
+ %b7 = getelementptr i8, ptr %a6, i32 2
+ %c8 = getelementptr i8, ptr %b7, i32 8
+ %d8 = getelementptr inbounds i8, ptr %b7, i32 12
+
+ store half 0xH0000, ptr %a6, align 16
+ store <4 x half> zeroinitializer, ptr %b7, align 2
+ store <2 x half> zeroinitializer, ptr %c8, align 2
+ store half 0xH0000, ptr %d8, align 2
+ ret void
+}
+
+define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: @v1x8_levels_6_7_8_9_10_11_12_13
+; CHECK: store <8 x half>
+
+ %level1 = getelementptr inbounds i8, ptr %arg1, i32 917504
+ %level2 = getelementptr i8, ptr %level1, i32 %arg0
+ %level3 = getelementptr i8, ptr %level2, i32 32768
+ %level4 = getelementptr inbounds i8, ptr %level3, i32 %arg0
+ %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+ %a6 = getelementptr i8, ptr %level5, i32 %arg0
+ %b7 = getelementptr i8, ptr %a6, i32 2
+ %c8 = getelementptr i8, ptr %b7, i32 2
+ %d9 = getelementptr inbounds i8, ptr %c8, i32 2
+ %e10 = getelementptr inbounds i8, ptr %d9, i32 2
+ %f11 = getelementptr inbounds i8, ptr %e10, i32 2
+ %g12 = getelementptr inbounds i8, ptr %f11, i32 2
+ %h13 = getelementptr inbounds i8, ptr %g12, i32 2
+
+ store half 0xH0000, ptr %a6, align 16
+ store half 0xH0000, ptr %b7, align 2
+ store half 0xH0000, ptr %c8, align 2
+ store half 0xH0000, ptr %d9, align 2
+ store half 0xH0000, ptr %e10, align 8
+ store half 0xH0000, ptr %f11, align 2
+ store half 0xH0000, ptr %g12, align 2
+ store half 0xH0000, ptr %h13, align 2
+ ret void
+}
>From 91a74c80e89ff46dcaf15dad30f9667e082d5979 Mon Sep 17 00:00:00 2001
From: "Klochkov, Vyacheslav N" <vyacheslav.n.klochkov at intel.com>
Date: Fri, 1 Nov 2024 09:09:04 -0700
Subject: [PATCH 2/2] [NFC] Address review comments: fix misprint + use C++17
structured bindings
---
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 8 ++------
.../LoadStoreVectorizer/X86/massive_indirection.ll | 2 +-
2 files changed, 3 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 59c7f2239d972a..699fea8872dc58 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1391,15 +1391,11 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
// For each item in RedKeyToUOMap, if it has more than one underlying object,
// try to merge the equivalence classes.
- for (auto &RedKeyToUO : RedKeyToUOMap) {
- auto UObjects = RedKeyToUO.second;
+ for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
if (UObjects.size() < 2)
continue;
- const auto RedKey = RedKeyToUO.first;
auto UTMap = GetUltimateTargets(UObjects);
- for (const auto &UT : UTMap) {
- const Value *UObject = UT.first;
- const Value *UltimateTarget = UT.second;
+ for (const auto &[UObject, UltimateTarget] : UTMap) {
if (UObject == UltimateTarget)
continue;
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
index ab320f02ed937d..b909f354393fab 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -7,7 +7,7 @@
; common underlying object deeper than 6 levels from the original pointer.
; The test below is the simplified version of actual performance oriented
-; workload; the offsets in getelementptr instructins are similar or same for
+; workload; the offsets in getelementptr instructions are similar or same for
; the test simplicity.
define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
More information about the llvm-commits
mailing list