[llvm] [LSV] Insert casts to vectorize mismatched types (PR #134436)
Anshil Gandhi via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 4 12:06:57 PDT 2025
https://github.com/gandhi56 created https://github.com/llvm/llvm-project/pull/134436
After collecting equivalence classes, loop over each distinct pair of them and check if they could be merged into one.
Consider classes A and B such that their leaders differ only by their scalar bitwidth. (We do not merge them otherwise.) Let N be the scalar bitwidth of the leader instruction in A. Iterate over all instructions in B and ensure their total bitwidths match the total bitwidth of the leader instruction of A. Finally, cast each instruction in B with a mismatched type to an intN type.
>From 4bc8fa8859ceb4a22a0b9889b17395972e152da3 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 29 Mar 2025 18:14:27 -0400
Subject: [PATCH 1/3] [NFC][LSV] Precommit tests
This commit adds tests to introduce bitcasts
to vectorize loads and stores.
---
.../AMDGPU/merge-vectors.ll | 29 +++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index ede2e4066c263..b9a948f46ea3b 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -111,3 +111,32 @@ entry:
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+
+; CHECK-LABEL: @merge_i32_2i16_float_4i8(
+; CHECK: load i32
+; CHECK: load <2 x i16>
+; CHECK: load float
+; CHECK: load <4 x i8>
+; CHECK: store i32
+; CHECK: store <2 x i16>
+; CHECK: store float
+; CHECK: store <4 x i8>
+define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+ %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
+ %load1 = load i32, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1
+ %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
+ %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2
+ %load3 = load float, ptr addrspace(1) %gep3, align 4
+ %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3
+ %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
+ %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+ store i32 %load1, ptr addrspace(2) %store.gep1, align 4
+ %store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1
+ store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4
+ %store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2
+ store float %load3, ptr addrspace(2) %store.gep3, align 4
+ %store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3
+ store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4
+ ret void
+}
>From 42a70c79317235dfacdc06cb4427a416e8fd294d Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 29 Mar 2025 18:14:27 -0400
Subject: [PATCH 2/3] [LSV] Precommit tests
This commit adds tests to introduce bitcasts
for increased vectorization of loads and stores.
NFC.
---
.../AMDGPU/insert-casts-vectorize.ll | 89 +++++++++++++++++++
1 file changed, 89 insertions(+)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
new file mode 100644
index 0000000000000..a1bccd4665414
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - < %s | FileCheck %s
+
+define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @merge_i32_2i16_float_4i8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR]], i64 2
+; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR]], i64 3
+; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 0
+ %load1 = load i32, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr, i64 1
+ %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
+ %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr, i64 2
+ %load3 = load float, ptr addrspace(1) %gep3, align 4
+ %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr, i64 3
+ %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
+ ret void
+}
+
+define void @no_merge_i32_i16(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @no_merge_i32_i16(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 0
+ %load1 = load i32, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 1
+ %load2 = load i16, ptr addrspace(1) %gep2, align 4
+ ret void
+}
+
+define void @merge_i64_double_ptr(ptr addrspace(1) %ptr, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_i64_double_ptr(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i64>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <3 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <3 x i64> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[LOAD22]] to double
+; CHECK-NEXT: [[LOAD33:%.*]] = extractelement <3 x i64> [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[LOAD33]] to ptr
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LOAD11]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP2]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP5]], i32 1
+; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i64, ptr addrspace(1) %ptr, i64 0
+ %gep2 = getelementptr inbounds double, ptr addrspace(1) %ptr, i64 1
+ %gep3 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 2
+ %load1 = load i64, ptr addrspace(1) %gep1, align 4
+ %load2 = load double, ptr addrspace(1) %gep2, align 4
+ %load3 = load ptr, ptr addrspace(1) %gep3, align 4
+ store i64 %load1, ptr addrspace(1) %gep1, align 4
+ store double %load2, ptr addrspace(1) %gep2, align 4
+ store ptr %load3, ptr addrspace(1) %gep3, align 4
+ ret void
+}
+
+define void @merge_i16_half(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @merge_i16_half(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
+; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[LOAD22]] to half
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 0
+ %load1 = load i16, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds half, ptr addrspace(1) %ptr, i64 1
+ %load2 = load half, ptr addrspace(1) %gep2, align 4
+ ret void
+}
>From 49d9fabc026a9975a63b96e7fe27c383f72a490d Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 29 Mar 2025 18:14:27 -0400
Subject: [PATCH 3/3] [LSV] Insert casts to vectorize mismatched types
After collecting equivalence classes, loop over
each distinct pair of them and check if they could
be merged into one.
Consider classes A and B such that their leaders
differ only by their scalar bitwidth. (We do not
merge them otherwise.) Let N be the scalar
bitwidth of the leader instruction in A. Iterate
over all instructions in B and ensure their total
bitwidths match the total bitwidth of the leader
instruction of A. Finally, cast each instruction
in B with a mismatched type to an intN type.
---
.../Vectorize/LoadStoreVectorizer.cpp | 82 ++++++++++++++++-
.../AMDGPU/insert-casts-vectorize.ll | 89 -------------------
.../AMDGPU/merge-vectors.ll | 17 ++--
3 files changed, 87 insertions(+), 101 deletions(-)
delete mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 04b392829f0d7..c94f10fb8b855 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -324,6 +324,10 @@ class Vectorizer {
Instruction *ChainElem, Instruction *ChainBegin,
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+ /// Merge the equivalence classes if casts could be inserted in one to match
+ /// the scalar bitwidth of the instructions in the other class.
+ void insertCastsToMergeClasses(EquivalenceClassMap &EQClasses);
+
/// Merges the equivalence classes if they have underlying objects that differ
/// by one level of indirection (i.e., one is a getelementptr and the other is
/// the base pointer in that getelementptr).
@@ -1310,6 +1314,82 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
return std::nullopt;
}
+void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
+ if (EQClasses.size() < 2)
+ return;
+
+ // Loop over all equivalence classes and try to merge them. Keep track of
+ // classes that are merged into others.
+ DenseSet<EqClassKey> ClassesToErase;
+ for (auto EC1 : EQClasses) {
+ for (auto EC2 : EQClasses) {
+ if (ClassesToErase.contains(EC2.first) || EC1 <= EC2)
+ continue;
+
+ auto [Ptr1, AS1, TySize1, IsLoad1] = EC1.first;
+ auto [Ptr2, AS2, TySize2, IsLoad2] = EC2.first;
+
+ // Attempt to merge EC2 into EC1. Skip if the pointers, address spaces or
+ // whether the leader instruction is a load/store are different. Also skip
+ // if the scalar bitwidth of the first equivalence class is smaller than
+ // the second one to avoid reconsidering the same equivalence class pair.
+ if (Ptr1 != Ptr2 || AS1 != AS2 || IsLoad1 != IsLoad2 || TySize1 < TySize2)
+ continue;
+
+ // Ensure all instructions in EC2 can be bitcasted into NewTy.
+ /// TODO: NewTyBits is needed because structured-binding variables cannot be
+ /// captured by a lambda until C++20.
+ auto NewTyBits = std::get<2>(EC1.first);
+ if (any_of(EC2.second, [&](Instruction *I) {
+ return DL.getTypeSizeInBits(getLoadStoreType(I)) != NewTyBits;
+ }))
+ continue;
+
+ // Create a new type for the equivalence class.
+ /// TODO: NewTy should be an FP type for an all-FP equivalence class.
+ auto *NewTy = Type::getIntNTy(EC2.second[0]->getContext(), NewTyBits);
+ for (auto *Inst : EC2.second) {
+ auto *Ptr = getLoadStorePointerOperand(Inst);
+ auto *OrigTy = Inst->getType();
+ if (OrigTy == NewTy)
+ continue;
+ if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+ Builder.SetInsertPoint(LI->getIterator());
+ auto *NewLoad = Builder.CreateLoad(NewTy, Ptr);
+ auto *Cast = Builder.CreateBitOrPointerCast(
+ NewLoad, OrigTy, NewLoad->getName() + ".cast");
+ LI->replaceAllUsesWith(Cast);
+ LI->eraseFromParent();
+ EQClasses[EC1.first].emplace_back(NewLoad);
+ } else {
+ auto *SI = cast<StoreInst>(Inst);
+ Builder.SetInsertPoint(SI->getIterator());
+ auto *Cast = Builder.CreateBitOrPointerCast(
+ SI->getValueOperand(), NewTy,
+ SI->getValueOperand()->getName() + ".cast");
+ auto *NewStore = Builder.CreateStore(
+ Cast, getLoadStorePointerOperand(SI), SI->isVolatile());
+ SI->eraseFromParent();
+ EQClasses[EC1.first].emplace_back(NewStore);
+ }
+ }
+
+ // Sort the instructions in the equivalence class by their order in the
+ // basic block. This is important to ensure that the instructions are
+ // vectorized in the correct order.
+ std::sort(EQClasses[EC1.first].begin(), EQClasses[EC1.first].end(),
+ [](Instruction *A, Instruction *B) {
+ return A && B && A->comesBefore(B);
+ });
+ ClassesToErase.insert(EC2.first);
+ }
+ }
+
+ // Erase the equivalence classes that were merged into others.
+ for (auto Key : ClassesToErase)
+ EQClasses.erase(Key);
+}
+
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
if (EQClasses.size() < 2) // There is nothing to merge.
return;
@@ -1495,7 +1575,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
/*IsLoad=*/LI != nullptr}]
.emplace_back(&I);
}
-
+ insertCastsToMergeClasses(Ret);
mergeEquivalenceClasses(Ret);
return Ret;
}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
deleted file mode 100644
index a1bccd4665414..0000000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - < %s | FileCheck %s
-
-define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr) {
-; CHECK-LABEL: define void @merge_i32_2i16_float_4i8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR]], i64 2
-; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
-; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR]], i64 3
-; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 0
- %load1 = load i32, ptr addrspace(1) %gep1, align 4
- %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr, i64 1
- %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
- %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr, i64 2
- %load3 = load float, ptr addrspace(1) %gep3, align 4
- %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr, i64 3
- %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
- ret void
-}
-
-define void @no_merge_i32_i16(ptr addrspace(1) %ptr) {
-; CHECK-LABEL: define void @no_merge_i32_i16(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 0
- %load1 = load i32, ptr addrspace(1) %gep1, align 4
- %gep2 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 1
- %load2 = load i16, ptr addrspace(1) %gep2, align 4
- ret void
-}
-
-define void @merge_i64_double_ptr(ptr addrspace(1) %ptr, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_i64_double_ptr(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i64>, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <3 x i64> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <3 x i64> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[LOAD22]] to double
-; CHECK-NEXT: [[LOAD33:%.*]] = extractelement <3 x i64> [[TMP1]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[LOAD33]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LOAD11]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP2]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP5]], i32 1
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(1) [[GEP3]], align 4
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds i64, ptr addrspace(1) %ptr, i64 0
- %gep2 = getelementptr inbounds double, ptr addrspace(1) %ptr, i64 1
- %gep3 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 2
- %load1 = load i64, ptr addrspace(1) %gep1, align 4
- %load2 = load double, ptr addrspace(1) %gep2, align 4
- %load3 = load ptr, ptr addrspace(1) %gep3, align 4
- store i64 %load1, ptr addrspace(1) %gep1, align 4
- store double %load2, ptr addrspace(1) %gep2, align 4
- store ptr %load3, ptr addrspace(1) %gep3, align 4
- ret void
-}
-
-define void @merge_i16_half(ptr addrspace(1) %ptr) {
-; CHECK-LABEL: define void @merge_i16_half(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[LOAD22]] to half
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 0
- %load1 = load i16, ptr addrspace(1) %gep1, align 4
- %gep2 = getelementptr inbounds half, ptr addrspace(1) %ptr, i64 1
- %load2 = load half, ptr addrspace(1) %gep2, align 4
- ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index b9a948f46ea3b..c364bc2da4c5d 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -95,10 +95,10 @@ entry:
ret void
}
-; Ideally this would be merged
; CHECK-LABEL: @merge_load_i32_v2i16(
-; CHECK: load i32,
-; CHECK: load <2 x i16>
+; CHECK: load <2 x i32>
+; CHECK: extractelement <2 x i32> %0, i32 0
+; CHECK: extractelement <2 x i32> %0, i32 1
define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
entry:
%a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
@@ -113,14 +113,9 @@ attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
; CHECK-LABEL: @merge_i32_2i16_float_4i8(
-; CHECK: load i32
-; CHECK: load <2 x i16>
-; CHECK: load float
-; CHECK: load <4 x i8>
-; CHECK: store i32
-; CHECK: store <2 x i16>
-; CHECK: store float
-; CHECK: store <4 x i8>
+; CHECK: load <4 x i32>
+; CHECK: store <2 x i32>
+; CHECK: store <2 x i32>
define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
%gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
%load1 = load i32, ptr addrspace(1) %gep1, align 4
More information about the llvm-commits
mailing list