[llvm] [VectorCombine] Try to scalarize vector loads feeding bitcast instructions. (PR #164682)
Julian Nagele via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 04:59:00 PST 2025
https://github.com/juliannagele updated https://github.com/llvm/llvm-project/pull/164682
From 6fcd3779ac2298f9f38e66900351c2d83aa22330 Mon Sep 17 00:00:00 2001
From: Julian Nagele <j_nagele at apple.com>
Date: Wed, 22 Oct 2025 19:48:58 +0100
Subject: [PATCH 1/4] [VectorCombine] Try to scalarize vector loads feeding
bitcast instructions.
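When a vector load is used only by bitcast instructions that all convert to
the same integer or floating-point scalar type of the same bit width, the
load itself can be rewritten as a scalar load of that type, subject to the
TTI cost model. An illustrative before/after, mirroring the new
load_v4i8_bitcast_to_i32 test added below:

  %lv = load <4 x i8>, ptr %x
  %r  = bitcast <4 x i8> %lv to i32

becomes

  %r.scalar = load i32, ptr %x, align 4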
---
.../Transforms/Vectorize/VectorCombine.cpp | 144 ++++++++++++++----
.../AArch64/load-bitcast-scalarization.ll | 136 +++++++++++++++++
2 files changed, 252 insertions(+), 28 deletions(-)
create mode 100644 llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d6eb00da11dc8..e045282c387fe 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -129,7 +129,9 @@ class VectorCombine {
bool foldExtractedCmps(Instruction &I);
bool foldBinopOfReductions(Instruction &I);
bool foldSingleElementStore(Instruction &I);
- bool scalarizeLoadExtract(Instruction &I);
+ bool scalarizeLoad(Instruction &I);
+ bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
+ bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
bool scalarizeExtExtract(Instruction &I);
bool foldConcatOfBoolMasks(Instruction &I);
bool foldPermuteOfBinops(Instruction &I);
@@ -1845,49 +1847,42 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
return false;
}
-/// Try to scalarize vector loads feeding extractelement instructions.
-bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
- if (!TTI.allowVectorElementIndexingUsingGEP())
- return false;
-
+/// Try to scalarize vector loads feeding extractelement or bitcast
+/// instructions.
+bool VectorCombine::scalarizeLoad(Instruction &I) {
Value *Ptr;
if (!match(&I, m_Load(m_Value(Ptr))))
return false;
auto *LI = cast<LoadInst>(&I);
auto *VecTy = cast<VectorType>(LI->getType());
- if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
+ if (!VecTy || LI->isVolatile() ||
+ !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
return false;
- InstructionCost OriginalCost =
- TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
- LI->getPointerAddressSpace(), CostKind);
- InstructionCost ScalarizedCost = 0;
-
+ // Check what type of users we have and ensure no memory modifications between
+ // the load and its users.
+ bool AllExtracts = true;
+ bool AllBitcasts = true;
Instruction *LastCheckedInst = LI;
unsigned NumInstChecked = 0;
- DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
- auto FailureGuard = make_scope_exit([&]() {
- // If the transform is aborted, discard the ScalarizationResults.
- for (auto &Pair : NeedFreeze)
- Pair.second.discard();
- });
- // Check if all users of the load are extracts with no memory modifications
- // between the load and the extract. Compute the cost of both the original
- // code and the scalarized version.
for (User *U : LI->users()) {
- auto *UI = dyn_cast<ExtractElementInst>(U);
- if (!UI || UI->getParent() != LI->getParent())
+ auto *UI = dyn_cast<Instruction>(U);
+ if (!UI || UI->getParent() != LI->getParent() || UI->use_empty())
return false;
- // If any extract is waiting to be erased, then bail out as this will
+ // If any user is waiting to be erased, then bail out as this will
// distort the cost calculation and possibly lead to infinite loops.
if (UI->use_empty())
return false;
- // Check if any instruction between the load and the extract may modify
- // memory.
+ if (!isa<ExtractElementInst>(UI))
+ AllExtracts = false;
+ if (!isa<BitCastInst>(UI))
+ AllBitcasts = false;
+
+ // Check if any instruction between the load and the user may modify memory.
if (LastCheckedInst->comesBefore(UI)) {
for (Instruction &I :
make_range(std::next(LI->getIterator()), UI->getIterator())) {
@@ -1899,6 +1894,35 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
}
LastCheckedInst = UI;
}
+ }
+
+ if (AllExtracts)
+ return scalarizeLoadExtract(LI, VecTy, Ptr);
+ if (AllBitcasts)
+ return scalarizeLoadBitcast(LI, VecTy, Ptr);
+ return false;
+}
+
+/// Try to scalarize vector loads feeding extractelement instructions.
+bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
+ Value *Ptr) {
+ if (!TTI.allowVectorElementIndexingUsingGEP())
+ return false;
+
+ InstructionCost OriginalCost =
+ TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind);
+ InstructionCost ScalarizedCost = 0;
+
+ DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
+ auto FailureGuard = make_scope_exit([&]() {
+ // If the transform is aborted, discard the ScalarizationResults.
+ for (auto &Pair : NeedFreeze)
+ Pair.second.discard();
+ });
+
+ for (User *U : LI->users()) {
+ auto *UI = cast<ExtractElementInst>(U);
auto ScalarIdx =
canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
@@ -1920,7 +1944,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
nullptr, nullptr, CostKind);
}
- LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
+ LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
<< "\n LoadExtractCost: " << OriginalCost
<< " vs ScalarizedCost: " << ScalarizedCost << "\n");
@@ -1966,6 +1990,70 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return true;
}
+/// Try to scalarize vector loads feeding bitcast instructions.
+bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
+ Value *Ptr) {
+ InstructionCost OriginalCost =
+ TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind);
+
+ Type *TargetScalarType = nullptr;
+ unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
+
+ for (User *U : LI->users()) {
+ auto *BC = cast<BitCastInst>(U);
+
+ Type *DestTy = BC->getDestTy();
+ if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
+ return false;
+
+ unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
+ if (DestBitWidth != VecBitWidth)
+ return false;
+
+ // All bitcasts should target the same scalar type.
+ if (!TargetScalarType)
+ TargetScalarType = DestTy;
+ else if (TargetScalarType != DestTy)
+ return false;
+
+ OriginalCost +=
+ TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
+ TTI.getCastContextHint(BC), CostKind, BC);
+ }
+
+ if (!TargetScalarType || LI->user_empty())
+ return false;
+ InstructionCost ScalarizedCost =
+ TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind);
+
+ LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
+ << "\n OriginalCost: " << OriginalCost
+ << " vs ScalarizedCost: " << ScalarizedCost << "\n");
+
+ if (ScalarizedCost >= OriginalCost)
+ return false;
+
+ // Ensure we add the load back to the worklist BEFORE its users so they can
+ // be erased in the correct order.
+ Worklist.push(LI);
+
+ Builder.SetInsertPoint(LI);
+ auto *ScalarLoad =
+ Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
+ ScalarLoad->setAlignment(LI->getAlign());
+ ScalarLoad->copyMetadata(*LI);
+
+ // Replace all bitcast users with the scalar load.
+ for (User *U : LI->users()) {
+ auto *BC = cast<BitCastInst>(U);
+ replaceValue(*BC, *ScalarLoad, false);
+ }
+
+ return true;
+}
+
bool VectorCombine::scalarizeExtExtract(Instruction &I) {
if (!TTI.allowVectorElementIndexingUsingGEP())
return false;
@@ -4555,7 +4643,7 @@ bool VectorCombine::run() {
if (IsVectorType) {
if (scalarizeOpOrCmp(I))
return true;
- if (scalarizeLoadExtract(I))
+ if (scalarizeLoad(I))
return true;
if (scalarizeExtExtract(I))
return true;
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
new file mode 100644
index 0000000000000..464e5129262bc
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck %s
+
+define i32 @load_v4i8_bitcast_to_i32(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_to_i32(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT: ret i32 [[R_SCALAR]]
+;
+ %lv = load <4 x i8>, ptr %x
+ %r = bitcast <4 x i8> %lv to i32
+ ret i32 %r
+}
+
+define i64 @load_v2i32_bitcast_to_i64(ptr %x) {
+; CHECK-LABEL: define i64 @load_v2i32_bitcast_to_i64(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load i64, ptr [[X]], align 8
+; CHECK-NEXT: ret i64 [[R_SCALAR]]
+;
+ %lv = load <2 x i32>, ptr %x
+ %r = bitcast <2 x i32> %lv to i64
+ ret i64 %r
+}
+
+define float @load_v4i8_bitcast_to_float(ptr %x) {
+; CHECK-LABEL: define float @load_v4i8_bitcast_to_float(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: ret float [[R_SCALAR]]
+;
+ %lv = load <4 x i8>, ptr %x
+ %r = bitcast <4 x i8> %lv to float
+ ret float %r
+}
+
+define float @load_v2i16_bitcast_to_float(ptr %x) {
+; CHECK-LABEL: define float @load_v2i16_bitcast_to_float(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: ret float [[R_SCALAR]]
+;
+ %lv = load <2 x i16>, ptr %x
+ %r = bitcast <2 x i16> %lv to float
+ ret float %r
+}
+
+define double @load_v4i16_bitcast_to_double(ptr %x) {
+; CHECK-LABEL: define double @load_v4i16_bitcast_to_double(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i16>, ptr [[X]], align 8
+; CHECK-NEXT: [[R_SCALAR:%.*]] = bitcast <4 x i16> [[LV]] to double
+; CHECK-NEXT: ret double [[R_SCALAR]]
+;
+ %lv = load <4 x i16>, ptr %x
+ %r = bitcast <4 x i16> %lv to double
+ ret double %r
+}
+
+define double @load_v2i32_bitcast_to_double(ptr %x) {
+; CHECK-LABEL: define double @load_v2i32_bitcast_to_double(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <2 x i32>, ptr [[X]], align 8
+; CHECK-NEXT: [[R_SCALAR:%.*]] = bitcast <2 x i32> [[LV]] to double
+; CHECK-NEXT: ret double [[R_SCALAR]]
+;
+ %lv = load <2 x i32>, ptr %x
+ %r = bitcast <2 x i32> %lv to double
+ ret double %r
+}
+
+; Multiple users with the same bitcast type should be scalarized.
+define i32 @load_v4i8_bitcast_multiple_users_same_type(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_multiple_users_same_type(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV_SCALAR:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV_SCALAR]], [[LV_SCALAR]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %lv = load <4 x i8>, ptr %x
+ %r1 = bitcast <4 x i8> %lv to i32
+ %r2 = bitcast <4 x i8> %lv to i32
+ %add = add i32 %r1, %r2
+ ret i32 %add
+}
+
+; Different bitcast types should not be scalarized.
+define i32 @load_v4i8_bitcast_multiple_users_different_types(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_multiple_users_different_types(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R1:%.*]] = bitcast <4 x i8> [[LV]] to i32
+; CHECK-NEXT: [[R2:%.*]] = bitcast <4 x i8> [[LV]] to float
+; CHECK-NEXT: [[R2_INT:%.*]] = bitcast float [[R2]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[R1]], [[R2_INT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %lv = load <4 x i8>, ptr %x
+ %r1 = bitcast <4 x i8> %lv to i32
+ %r2 = bitcast <4 x i8> %lv to float
+ %r2.int = bitcast float %r2 to i32
+ %add = add i32 %r1, %r2.int
+ ret i32 %add
+}
+
+; Bitcast to vector should not be scalarized.
+define <2 x i16> @load_v4i8_bitcast_to_vector(ptr %x) {
+; CHECK-LABEL: define <2 x i16> @load_v4i8_bitcast_to_vector(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i8> [[LV]] to <2 x i16>
+; CHECK-NEXT: ret <2 x i16> [[R]]
+;
+ %lv = load <4 x i8>, ptr %x
+ %r = bitcast <4 x i8> %lv to <2 x i16>
+ ret <2 x i16> %r
+}
+
+; Load with both bitcast users and other users should not be scalarized.
+define i32 @load_v4i8_mixed_users(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_mixed_users(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R1:%.*]] = bitcast <4 x i8> [[LV]] to i32
+; CHECK-NEXT: [[R2:%.*]] = extractelement <4 x i8> [[LV]], i32 0
+; CHECK-NEXT: [[R2_EXT:%.*]] = zext i8 [[R2]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[R1]], [[R2_EXT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %lv = load <4 x i8>, ptr %x
+ %r1 = bitcast <4 x i8> %lv to i32
+ %r2 = extractelement <4 x i8> %lv, i32 0
+ %r2.ext = zext i8 %r2 to i32
+ %add = add i32 %r1, %r2.ext
+ ret i32 %add
+}
From 34b4f53fa9c8ec2a3d96e1c86b8b7073f3d91265 Mon Sep 17 00:00:00 2001
From: Julian Nagele <j_nagele at apple.com>
Date: Tue, 28 Oct 2025 18:36:53 +0000
Subject: [PATCH 2/4] fixup! [VectorCombine] Try to scalarize vector loads
feeding bitcast instructions.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 ++-
.../VectorCombine/AArch64/load-bitcast-scalarization.ll | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e045282c387fe..0c8a2820ede97 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2022,8 +2022,9 @@ bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
TTI.getCastContextHint(BC), CostKind, BC);
}
- if (!TargetScalarType || LI->user_empty())
+ if (!TargetScalarType)
return false;
+ assert(!LI->user_empty() && "Unexpected load without bitcast users");
InstructionCost ScalarizedCost =
TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
LI->getPointerAddressSpace(), CostKind);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
index 464e5129262bc..ca3df3310a795 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck %s
define i32 @load_v4i8_bitcast_to_i32(ptr %x) {
From 401c9337f178fd9d0e792c83cb889b85fdfab8cf Mon Sep 17 00:00:00 2001
From: Julian Nagele <j_nagele at apple.com>
Date: Wed, 5 Nov 2025 17:41:27 +0000
Subject: [PATCH 3/4] Add test that shows scalarization of load-ext-extract
sequence.
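The test runs vector-combine twice with dce in between; the expectation is
that a <4 x i8> load that is zero-extended to <4 x i32> and then fully
extracted collapses into a single scalar i32 load plus lshr/and. Abbreviated
sketch with illustrative value names, the full IR is in the CHECK lines
below:

  %x     = load <4 x i8>, ptr %src, align 4
  %ext   = zext nneg <4 x i8> %x to <4 x i32>
  %ext.0 = extractelement <4 x i32> %ext, i64 0
  ...

->

  %v     = load i32, ptr %src, align 4
  %ext.0 = and i32 %v, 255
  ...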
---
.../VectorCombine/AArch64/load-ext-extract.ll | 32 +++++++++++++++++++
1 file changed, 32 insertions(+)
create mode 100644 llvm/test/Transforms/VectorCombine/AArch64/load-ext-extract.ll
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-ext-extract.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-ext-extract.ll
new file mode 100644
index 0000000000000..0bf1640b43a66
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-ext-extract.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine,dce,vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck %s
+
+define noundef i32 @load_ext_extract(ptr %src) {
+; CHECK-LABEL: define noundef i32 @load_ext_extract(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP14]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 255
+; CHECK-NEXT: [[TMP18:%.*]] = lshr i32 [[TMP14]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 255
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP14]], 255
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP17]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP15]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+ %x = load <4 x i8>, ptr %src, align 4
+ %ext = zext nneg <4 x i8> %x to <4 x i32>
+ %ext.0 = extractelement <4 x i32> %ext, i64 0
+ %ext.1 = extractelement <4 x i32> %ext, i64 1
+ %ext.2 = extractelement <4 x i32> %ext, i64 2
+ %ext.3 = extractelement <4 x i32> %ext, i64 3
+
+ %add1 = add i32 %ext.0, %ext.1
+ %add2 = add i32 %add1, %ext.2
+ %add3 = add i32 %add2, %ext.3
+ ret i32 %add3
+}
From 504f3e8916b39e35d4a96c66fa1347aa301f1ef6 Mon Sep 17 00:00:00 2001
From: Julian Nagele <j_nagele at apple.com>
Date: Fri, 7 Nov 2025 12:58:29 +0000
Subject: [PATCH 4/4] !fixup address comments
---
.../Transforms/Vectorize/VectorCombine.cpp | 22 +++++++++----------
.../AArch64/scalarize-load-ext-extract.ll} | 12 +++++-----
2 files changed, 17 insertions(+), 17 deletions(-)
rename llvm/test/Transforms/{VectorCombine/AArch64/load-ext-extract.ll => PhaseOrdering/AArch64/scalarize-load-ext-extract.ll} (69%)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fdf2650a50a3a..b79f1d974e2fd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1856,20 +1856,19 @@ bool VectorCombine::scalarizeLoad(Instruction &I) {
auto *LI = cast<LoadInst>(&I);
auto *VecTy = cast<VectorType>(LI->getType());
- if (!VecTy || LI->isVolatile() ||
- !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
+ if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
return false;
- // Check what type of users we have and ensure no memory modifications between
- // the load and its users.
bool AllExtracts = true;
bool AllBitcasts = true;
Instruction *LastCheckedInst = LI;
unsigned NumInstChecked = 0;
+ // Check what type of users we have and ensure no memory modifications between
+ // the load and its users.
for (User *U : LI->users()) {
auto *UI = dyn_cast<Instruction>(U);
- if (!UI || UI->getParent() != LI->getParent() || UI->use_empty())
+ if (!UI || UI->getParent() != LI->getParent())
return false;
// If any user is waiting to be erased, then bail out as this will
@@ -1909,11 +1908,6 @@ bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
if (!TTI.allowVectorElementIndexingUsingGEP())
return false;
- InstructionCost OriginalCost =
- TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
- LI->getPointerAddressSpace(), CostKind);
- InstructionCost ScalarizedCost = 0;
-
DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
auto FailureGuard = make_scope_exit([&]() {
// If the transform is aborted, discard the ScalarizationResults.
@@ -1921,6 +1915,11 @@ bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
Pair.second.discard();
});
+ InstructionCost OriginalCost =
+ TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind);
+ InstructionCost ScalarizedCost = 0;
+
for (User *U : LI->users()) {
auto *UI = cast<ExtractElementInst>(U);
@@ -2011,7 +2010,7 @@ bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
if (DestBitWidth != VecBitWidth)
return false;
- // All bitcasts should target the same scalar type.
+ // All bitcasts must target the same scalar type.
if (!TargetScalarType)
TargetScalarType = DestTy;
else if (TargetScalarType != DestTy)
@@ -2024,6 +2023,7 @@ bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
if (!TargetScalarType)
return false;
+
assert(!LI->user_empty() && "Unexpected load without bitcast users");
InstructionCost ScalarizedCost =
TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-ext-extract.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll
similarity index 69%
rename from llvm/test/Transforms/VectorCombine/AArch64/load-ext-extract.ll
rename to llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll
index 0bf1640b43a66..f7918b0e0a798 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-ext-extract.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=vector-combine,dce,vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck %s
+; RUN: opt -O3 -mtriple=arm64-apple-darwinos -S %s | FileCheck %s
define noundef i32 @load_ext_extract(ptr %src) {
-; CHECK-LABEL: define noundef i32 @load_ext_extract(
-; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-LABEL: define noundef range(i32 0, 1021) i32 @load_ext_extract(
+; CHECK-SAME: ptr readonly captures(none) [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
@@ -12,9 +12,9 @@ define noundef i32 @load_ext_extract(ptr %src) {
; CHECK-NEXT: [[TMP18:%.*]] = lshr i32 [[TMP14]], 8
; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 255
; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP14]], 255
-; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP20]], [[TMP19]]
-; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP17]]
-; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP15]]
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[TMP17]]
+; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i32 [[ADD2]], [[TMP15]]
; CHECK-NEXT: ret i32 [[ADD3]]
;
entry: