[llvm] [LSV] Insert casts to vectorize mismatched types (PR #134436)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 4 12:07:32 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-vectorizers
Author: Anshil Gandhi (gandhi56)
<details>
<summary>Changes</summary>
After collecting equivalence classes, loop over each distinct pair of them and check if they could be merged into one.
Consider classes A and B such that their leaders differ only by their scalar bitwidth. (We do not merge them otherwise.) Let N be the scalar bitwidth of the leader instruction in A. Iterate over all instructions in B and ensure each one accesses a total of N bits. Finally, cast each instruction in B whose type differs from iN to an iN type.
---
Full diff: https://github.com/llvm/llvm-project/pull/134436.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (+81-1)
- (modified) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll (+27-3)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 04b392829f0d7..c94f10fb8b855 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -324,6 +324,10 @@ class Vectorizer {
Instruction *ChainElem, Instruction *ChainBegin,
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+ /// Merge the equivalence classes if casts could be inserted in one to match
+ /// the scalar bitwidth of the instructions in the other class.
+ void insertCastsToMergeClasses(EquivalenceClassMap &EQClasses);
+
/// Merges the equivalence classes if they have underlying objects that differ
/// by one level of indirection (i.e., one is a getelementptr and the other is
/// the base pointer in that getelementptr).
@@ -1310,6 +1314,82 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
return std::nullopt;
}
+void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
+ if (EQClasses.size() < 2)
+ return;
+
+ // Loop over all equivalence classes and try to merge them. Keep track of
+ // classes that are merged into others.
+ DenseSet<EqClassKey> ClassesToErase;
+ for (auto EC1 : EQClasses) {
+ for (auto EC2 : EQClasses) {
+ if (ClassesToErase.contains(EC2.first) || EC1 <= EC2)
+ continue;
+
+ auto [Ptr1, AS1, TySize1, IsLoad1] = EC1.first;
+ auto [Ptr2, AS2, TySize2, IsLoad2] = EC2.first;
+
+ // Attempt to merge EC2 into EC1. Skip if the pointers, address spaces or
+ // whether the leader instruction is a load/store are different. Also skip
+ // if the scalar bitwidth of the first equivalence class is smaller than
+ // the second one to avoid reconsidering the same equivalence class pair.
+ if (Ptr1 != Ptr2 || AS1 != AS2 || IsLoad1 != IsLoad2 || TySize1 < TySize2)
+ continue;
+
+ // Ensure all instructions in EC2 can be bitcasted into NewTy.
+ /// TODO: NewTyBits is needed as stuctured binded variables cannot be
+ /// captured by a lambda until C++20.
+ auto NewTyBits = std::get<2>(EC1.first);
+ if (any_of(EC2.second, [&](Instruction *I) {
+ return DL.getTypeSizeInBits(getLoadStoreType(I)) != NewTyBits;
+ }))
+ continue;
+
+ // Create a new type for the equivalence class.
+ /// TODO: NewTy should be an FP type for an all-FP equivalence class.
+ auto *NewTy = Type::getIntNTy(EC2.second[0]->getContext(), NewTyBits);
+ for (auto *Inst : EC2.second) {
+ auto *Ptr = getLoadStorePointerOperand(Inst);
+ auto *OrigTy = Inst->getType();
+ if (OrigTy == NewTy)
+ continue;
+ if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+ Builder.SetInsertPoint(LI->getIterator());
+ auto *NewLoad = Builder.CreateLoad(NewTy, Ptr);
+ auto *Cast = Builder.CreateBitOrPointerCast(
+ NewLoad, OrigTy, NewLoad->getName() + ".cast");
+ LI->replaceAllUsesWith(Cast);
+ LI->eraseFromParent();
+ EQClasses[EC1.first].emplace_back(NewLoad);
+ } else {
+ auto *SI = cast<StoreInst>(Inst);
+ Builder.SetInsertPoint(SI->getIterator());
+ auto *Cast = Builder.CreateBitOrPointerCast(
+ SI->getValueOperand(), NewTy,
+ SI->getValueOperand()->getName() + ".cast");
+ auto *NewStore = Builder.CreateStore(
+ Cast, getLoadStorePointerOperand(SI), SI->isVolatile());
+ SI->eraseFromParent();
+ EQClasses[EC1.first].emplace_back(NewStore);
+ }
+ }
+
+ // Sort the instructions in the equivalence class by their order in the
+ // basic block. This is important to ensure that the instructions are
+ // vectorized in the correct order.
+ std::sort(EQClasses[EC1.first].begin(), EQClasses[EC1.first].end(),
+ [](Instruction *A, Instruction *B) {
+ return A && B && A->comesBefore(B);
+ });
+ ClassesToErase.insert(EC2.first);
+ }
+ }
+
+ // Erase the equivalence classes that were merged into others.
+ for (auto Key : ClassesToErase)
+ EQClasses.erase(Key);
+}
+
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
if (EQClasses.size() < 2) // There is nothing to merge.
return;
@@ -1495,7 +1575,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
/*IsLoad=*/LI != nullptr}]
.emplace_back(&I);
}
-
+ insertCastsToMergeClasses(Ret);
mergeEquivalenceClasses(Ret);
return Ret;
}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index ede2e4066c263..c364bc2da4c5d 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -95,10 +95,10 @@ entry:
ret void
}
-; Ideally this would be merged
; CHECK-LABEL: @merge_load_i32_v2i16(
-; CHECK: load i32,
-; CHECK: load <2 x i16>
+; CHECK: load <2 x i32>
+; CHECK: extractelement <2 x i32> %0, i32 0
+; CHECK: extractelement <2 x i32> %0, i32 1
define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
entry:
%a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
@@ -111,3 +111,27 @@ entry:
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+
+; Four adjacent 32-bit accesses with mismatched element types (i32,
+; <2 x i16>, float, <4 x i8>). With cast insertion they land in one
+; equivalence class, so the loads merge into a single <4 x i32> load;
+; the stores are expected to merge into two <2 x i32> stores (why not a
+; single <4 x i32> store — presumably an addrspace(2) limit — should be
+; confirmed against the target).
+; CHECK-LABEL: @merge_i32_2i16_float_4i8(
+; CHECK: load <4 x i32>
+; CHECK: store <2 x i32>
+; CHECK: store <2 x i32>
+define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+  ; Loads from four consecutive 32-bit slots of %ptr1.
+  %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
+  %load1 = load i32, ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1
+  %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
+  %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2
+  %load3 = load float, ptr addrspace(1) %gep3, align 4
+  %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3
+  %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
+  ; Stores of the same values to four consecutive 32-bit slots of %ptr2.
+  %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+  store i32 %load1, ptr addrspace(2) %store.gep1, align 4
+  %store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1
+  store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4
+  %store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2
+  store float %load3, ptr addrspace(2) %store.gep3, align 4
+  %store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3
+  store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4
+  ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/134436
More information about the llvm-commits
mailing list