[llvm] [VPlan] Narrow VPWidenCastRecipe to scalar cast recipe. (PR #166514)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 30 01:32:30 PST 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/166514
>From c2710daffb782cd3fbf1453ff180b31720019f8d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 10 Jun 2025 07:57:49 -0700
Subject: [PATCH 1/6] IsSingleScalar
---
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 4 ++++
.../Transforms/LoopVectorize/single-scalar-cast-minbw.ll | 5 +++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4fc9a3f20ff46..085a2b725bb31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -290,6 +290,10 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
return preservesUniformity(WidenR->getOpcode()) &&
all_of(WidenR->operands(), isSingleScalar);
}
+ if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
+ return PreservesUniformity(CastR->getOpcode()) &&
+ all_of(CastR->operands(), isSingleScalar);
+ }
if (auto *VPI = dyn_cast<VPInstruction>(VPV))
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
(preservesUniformity(VPI->getOpcode()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
index fb25b2bc7b906..3f3caa9d64cbc 100644
--- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -18,8 +18,9 @@ define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
-; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = xor i8 [[TMP2]], [[TMP5]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
>From cbac76ebb91095e71847254c16d1e4698261645d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 4 Nov 2025 00:46:31 -0800
Subject: [PATCH 2/6] rebase and update
---
.../LoopVectorize/AArch64/predicated-costs.ll | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
index 14398eebc6674..b59d0496bef21 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -76,15 +76,15 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64>
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
+; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[TMP29]], 1
; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64>
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0
; CHECK-NEXT: [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
-; CHECK-NEXT: [[TMP105:%.*]] = or i64 [[TMP104]], 1
-; CHECK-NEXT: store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
@@ -92,9 +92,7 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
; CHECK: [[PRED_STORE_IF17]]:
; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1
; CHECK-NEXT: [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
-; CHECK-NEXT: [[TMP111:%.*]] = or i64 [[TMP110]], 1
-; CHECK-NEXT: store i64 [[TMP111]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
; CHECK: [[PRED_STORE_CONTINUE18]]:
; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
@@ -102,9 +100,7 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
; CHECK: [[PRED_STORE_IF19]]:
; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2
; CHECK-NEXT: [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
-; CHECK-NEXT: [[TMP117:%.*]] = or i64 [[TMP116]], 1
-; CHECK-NEXT: store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]]
; CHECK: [[PRED_STORE_CONTINUE20]]:
; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
@@ -112,9 +108,7 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
; CHECK: [[PRED_STORE_IF21]]:
; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3
; CHECK-NEXT: [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]]
-; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
-; CHECK-NEXT: [[TMP123:%.*]] = or i64 [[TMP122]], 1
-; CHECK-NEXT: store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
; CHECK: [[PRED_STORE_CONTINUE22]]:
; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
>From 81f3d742515c6a23256780b65ea6ffedcf6fd174 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 4 Nov 2025 05:16:08 -0800
Subject: [PATCH 3/6] [VPlan] Narrow VPWidenCastRecipe to scalar cast recipe.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 13 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +-
.../AArch64/deterministic-type-shrinkage.ll | 32 +++--
.../LoopVectorize/AArch64/induction-costs.ll | 5 +-
.../AArch64/neon-inloop-reductions.ll | 6 +-
.../partial-reduce-dot-product-epilogue.ll | 21 +--
.../partial-reduce-dot-product-neon.ll | 129 ++++++++----------
.../partial-reduce-incomplete-chains.ll | 13 +-
.../AArch64/partial-reduce-no-dotprod.ll | 10 +-
.../LoopVectorize/AArch64/predicated-costs.ll | 49 ++++---
.../AArch64/reduction-recurrence-costs-sve.ll | 20 +--
.../LoopVectorize/AArch64/store-costs-sve.ll | 37 +++--
.../AArch64/type-shrinkage-insertelt.ll | 8 +-
.../Transforms/LoopVectorize/RISCV/pr88802.ll | 5 +-
.../truncate-to-minimal-bitwidth-cost.ll | 10 +-
.../truncate-to-minimal-bitwidth-evl-crash.ll | 40 +-----
.../LoopVectorize/X86/cost-model.ll | 17 +--
.../Transforms/LoopVectorize/cse-casts.ll | 5 +-
...-order-recurrence-sink-replicate-region.ll | 22 +--
.../LoopVectorize/narrow-to-single-scalar.ll | 20 +--
.../predicatedinst-loop-invariant.ll | 76 ++++++-----
.../scalable-trunc-min-bitwidth.ll | 6 +-
.../LoopVectorize/single-scalar-cast-minbw.ll | 5 +-
.../LoopVectorize/trunc-loads-p16.ll | 5 +-
.../vplan-printing-reductions.ll | 24 ++--
25 files changed, 282 insertions(+), 298 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4b23a2b81f20a..7ee2e218a8926 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1582,8 +1582,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPWidenGEPRecipe,
- VPReplicateRecipe, VPWidenStoreRecipe>(&R))
+ if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenSelectRecipe,
+ VPWidenGEPRecipe, VPReplicateRecipe, VPWidenStoreRecipe>(&R))
continue;
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
@@ -1673,6 +1673,15 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
}))
continue;
+ if (auto *CastR = dyn_cast<VPWidenCastRecipe>(RepOrWidenR)) {
+ VPBuilder Builder(CastR);
+ auto *Clone = Builder.createScalarCast(
+ CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
+ CastR->getDebugLoc());
+ CastR->replaceAllUsesWith(Clone);
+ CastR->eraseFromParent();
+ continue;
+ }
auto *Clone = new VPReplicateRecipe(
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 085a2b725bb31..56f7d2114bab0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -291,7 +291,7 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
all_of(WidenR->operands(), isSingleScalar);
}
if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
- return PreservesUniformity(CastR->getOpcode()) &&
+ return preservesUniformity(CastR->getOpcode()) &&
all_of(CastR->operands(), isSingleScalar);
}
if (auto *VPI = dyn_cast<VPInstruction>(VPV))
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
index f0664197dcb94..890e4c507f0cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -125,12 +125,13 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[B]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT2]] to <16 x i16>
-; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = mul i16 [[TMP13]], [[B]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP14]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = lshr <16 x i16> [[TMP1]], splat (i16 8)
; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -149,12 +150,13 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 992, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT6]] to <8 x i16>
-; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i16> [[TMP7]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = mul i16 [[TMP15]], [[B]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <8 x i16> poison, i16 [[TMP16]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT6]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i16> [[TMP8]], splat (i16 8)
; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i16> [[TMP9]] to <8 x i8>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -202,9 +204,12 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 %
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = trunc <16 x i32> [[BROADCAST_SPLAT]] to <16 x i16>
-; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i16> [[TMP0]], splat (i16 99)
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <16 x i16> [[TMP1]], splat (i16 8)
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[C]], <16 x i16> [[TMP2]], <16 x i16> [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x i16> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TMP1]], 99
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i16 [[TMP2]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[C]], i16 [[TMP9]], i16 [[TMP2]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP15]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i16> [[TMP3]] to <16 x i8>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -225,9 +230,12 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 %
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> poison, i32 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT2]] to <8 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP8]], splat (i16 99)
-; CHECK-NEXT: [[TMP10:%.*]] = lshr <8 x i16> [[TMP9]], splat (i16 8)
-; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[C]], <8 x i16> [[TMP10]], <8 x i16> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = mul i16 [[TMP10]], 99
+; CHECK-NEXT: [[TMP17:%.*]] = lshr i16 [[TMP16]], 8
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[C]], i16 [[TMP17]], i16 [[TMP16]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <8 x i16> poison, i16 [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT5]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = trunc <8 x i16> [[TMP11]] to <8 x i8>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 40db6a53b49e4..53af820c6d8f9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -18,7 +18,10 @@ define i32 @multi_exit_iv_uniform(i32 %a, i64 %N, ptr %dst) {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i32> [[BROADCAST_SPLAT]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll
index 22696d0b297d9..db92e10c2e16e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll
@@ -51,10 +51,10 @@ define i32 @mul_used_outside_vpexpression(ptr %src.0, ptr %src.1) {
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC_0]], i64 [[INDEX3]]
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[NEXT_GEP6]], align 1
; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT9]] to <4 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
; CHECK-NEXT: [[TMP17]] = add i32 [[VEC_PHI4]], [[TMP16]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index fed979e34e5d5..d5e433b943c96 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -85,7 +85,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
@@ -93,7 +93,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
@@ -103,20 +103,23 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT7]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = sext i8 [[TMP16]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT12]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT11]] to <4 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext i8 [[TMP9]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP8]]
; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
@@ -140,7 +143,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0
; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1
; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret void
@@ -419,7 +422,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index b2be0e1d7a442..f16f913ad813a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -1906,19 +1906,16 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1940,21 +1937,18 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]])
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP3]])
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT4]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -1975,19 +1969,16 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32>
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]])
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -2026,7 +2017,10 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP10]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64>
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2034,17 +2028,14 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2060,7 +2051,10 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext i8 [[TMP7]] to i16
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP11]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2069,19 +2063,16 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[VEC_PHI1]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP6]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2097,7 +2088,10 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) {
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP10]], i64 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64>
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2105,17 +2099,14 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -2154,7 +2145,10 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP10]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2162,17 +2156,14 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2188,7 +2179,10 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext i8 [[TMP7]] to i16
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP11]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2197,19 +2191,16 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i32> [[VEC_PHI1]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2225,7 +2216,10 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) {
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP10]], i64 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2233,17 +2227,14 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -2282,7 +2273,10 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext i8 [[TMP6]] to i16
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP10]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2290,17 +2284,14 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2316,7 +2307,10 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sext i8 [[TMP7]] to i16
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP11]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2325,19 +2319,16 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i32> [[VEC_PHI1]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2353,7 +2344,10 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) {
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext i8 [[TMP6]] to i16
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP10]], i64 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]]
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -2361,17 +2355,14 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
index 2060168c531fb..91d5dbc933562 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
@@ -82,15 +82,14 @@ define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B)
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: store i16 [[TMP2]], ptr [[DST]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[B]], align 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP3]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT1]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT2]] to <16 x i16>
+; CHECK-NEXT: [[TMP10:%.*]] = zext i8 [[TMP3]] to i16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i16> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP5]], [[TMP1]]
; CHECK-NEXT: [[TMP7]] = add <16 x i16> [[TMP6]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
index a439f5189794a..24420ab21da36 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
@@ -81,8 +81,14 @@ define i40 @partial_reduce_not_known_factor(i32 %a, i32 %b, i16 %N) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[N_VEC]] to i16
-; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[BROADCAST_SPLAT2]] to <2 x i40>
-; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i32> [[BROADCAST_SPLAT]] to <2 x i40>
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[BROADCAST_SPLAT2]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i40
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i40> poison, i40 [[TMP11]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i40> [[BROADCAST_SPLATINSERT5]], <2 x i40> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i40
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i40> poison, i40 [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i40> [[BROADCAST_SPLATINSERT3]], <2 x i40> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i40> [[TMP4]], [[TMP3]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
index b59d0496bef21..bf62698b8ef8f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -72,43 +72,40 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 4, i8 8, i8 12>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64>
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: [[TMP29:%.*]] = zext i8 [[TMP25]] to i64
; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[TMP29]], 1
-; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT: [[TMP27:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64>
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
-; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]]
-; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i64> [[TMP27]], i32 0
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP65]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP66]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
; CHECK: [[PRED_STORE_IF17]]:
-; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1
-; CHECK-NEXT: [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]]
-; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i64> [[TMP27]], i32 1
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP68]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP69]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
; CHECK: [[PRED_STORE_CONTINUE18]]:
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
; CHECK: [[PRED_STORE_IF19]]:
-; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2
-; CHECK-NEXT: [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]]
-; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i64> [[TMP27]], i32 2
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP71]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP72]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]]
; CHECK: [[PRED_STORE_CONTINUE20]]:
-; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]]
; CHECK: [[PRED_STORE_IF21]]:
-; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3
-; CHECK-NEXT: [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]]
-; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i64> [[TMP27]], i32 3
+; CHECK-NEXT: [[TMP75:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP74]], i64 [[OFF]]
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP75]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
; CHECK: [[PRED_STORE_CONTINUE22]]:
; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 6f90cbc3d8f51..0e0e4e0fbe6ae 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -67,11 +67,11 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; VSCALEFORTUNING2-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP11:%.*]] = or <vscale x 4 x i32> [[TMP9]], [[TMP10]]
; VSCALEFORTUNING2-NEXT: [[TMP16:%.*]] = or i32 [[Z]], [[X]]
-; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP16]], i64 0
-; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and <vscale x 4 x i32> [[TMP12]], splat (i32 1)
-; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i32> [[TMP13]], splat (i32 1)
-; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[TMP14]] to <vscale x 4 x i64>
+; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = and i32 [[TMP16]], 1
+; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = xor i32 [[TMP14]], 1
+; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
+; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
+; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP15]]
; VSCALEFORTUNING2-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
; VSCALEFORTUNING2-NEXT: [[TMP19:%.*]] = mul nuw i32 [[TMP18]], 4
@@ -194,11 +194,11 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[TMP14:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; PRED-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; PRED-NEXT: [[TMP20:%.*]] = or i32 [[Z]], [[X]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
-; PRED-NEXT: [[TMP16:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP17:%.*]] = and <vscale x 4 x i32> [[TMP16]], splat (i32 1)
-; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[TMP17]], splat (i32 1)
-; PRED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i32> [[TMP18]] to <vscale x 4 x i64>
+; PRED-NEXT: [[TMP17:%.*]] = and i32 [[TMP20]], 1
+; PRED-NEXT: [[TMP18:%.*]] = xor i32 [[TMP17]], 1
+; PRED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP18]] to i64
+; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0
+; PRED-NEXT: [[TMP19:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP19]]
; PRED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
; PRED-NEXT: [[TMP23:%.*]] = mul nuw i32 [[TMP22]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index d3edad4010bb8..80dbb3d9bf4da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -140,10 +140,13 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i16> poison, i16 [[X]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP5:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP5]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP6:%.*]] = trunc <vscale x 16 x i64> [[BROADCAST_SPLAT3]] to <vscale x 16 x i8>
-; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <vscale x 16 x i16> [[BROADCAST_SPLAT]] to <vscale x 16 x i8>
+; DEFAULT-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP5]] to i8
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP19]], i64 0
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i16> [[BROADCAST_SPLAT]], i32 0
+; DEFAULT-NEXT: [[TMP8:%.*]] = trunc i16 [[TMP20]] to i8
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP8]], i64 0
+; DEFAULT-NEXT: [[TMP13:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT5]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP14:%.*]] = and <vscale x 16 x i8> [[TMP6]], [[TMP13]]
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
; DEFAULT: vector.body:
@@ -165,11 +168,14 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT4]], <8 x i16> poison, <8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META13:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP8]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], <8 x i64> poison, <8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP9:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8>
-; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8>
+; DEFAULT-NEXT: [[TMP21:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META13:![0-9]+]]
+; DEFAULT-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP22]], i64 0
+; DEFAULT-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT8]], <8 x i8> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[BROADCAST_SPLAT5]], i32 0
+; DEFAULT-NEXT: [[TMP23:%.*]] = trunc i16 [[TMP15]] to i8
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[TMP23]], i64 0
+; DEFAULT-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT10]], <8 x i8> poison, <8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP10:%.*]] = and <8 x i8> [[TMP9]], [[TMP7]]
; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; DEFAULT: vec.epilog.vector.body:
@@ -215,15 +221,18 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i16> poison, i16 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 1000)
-; PRED-NEXT: [[TMP2:%.*]] = trunc <vscale x 16 x i16> [[BROADCAST_SPLAT]] to <vscale x 16 x i8>
+; PRED-NEXT: [[TMP9:%.*]] = extractelement <vscale x 16 x i16> [[BROADCAST_SPLAT]], i32 0
+; PRED-NEXT: [[TMP3:%.*]] = trunc i16 [[TMP9]] to i8
+; PRED-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP3]], i64 0
+; PRED-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT4]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT: [[TMP3:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META3:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP3]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; PRED-NEXT: [[TMP4:%.*]] = trunc <vscale x 16 x i64> [[BROADCAST_SPLAT3]] to <vscale x 16 x i8>
+; PRED-NEXT: [[TMP10:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META3:![0-9]+]]
+; PRED-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i8
+; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP11]], i64 0
+; PRED-NEXT: [[TMP4:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[TMP5:%.*]] = and <vscale x 16 x i8> [[TMP4]], [[TMP2]]
; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP5]], ptr align 1 [[TMP6]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
index 4761cb0d63de7..8977941183df4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
@@ -89,12 +89,12 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[C]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16>
-; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP4]] to i16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP7]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP7]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index 0723f16677090..6a2bfc24d0e5d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -14,7 +14,10 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <vscale x 2 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT8]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index f4c7c6f6fba1b..f50aaa2f83c1a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -193,12 +193,10 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[V]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[N]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[T]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[N]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[T]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT4]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[X]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP9]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP10]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
index 232c354764e1a..6344fea65dc16 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
@@ -50,42 +50,10 @@ define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
; CHECK-LABEL: define void @truncate_i16_to_i8_cse(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4294967296, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 4294967296, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 4294967296, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[SRC]], align 2
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP5]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = trunc <vscale x 8 x i16> [[BROADCAST_SPLAT]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 8 x i8> [[TMP6]], i32 [[TMP9]]
-; CHECK-NEXT: store i8 [[TMP10]], ptr null, align 1
-; CHECK-NEXT: store i8 [[TMP10]], ptr [[DST]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4294967296, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[COUNT_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[COUNT:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[COUNT_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[SRC]], align 2
; CHECK-NEXT: [[VAL_ZEXT:%.*]] = zext i16 [[VAL]] to i64
; CHECK-NEXT: [[VAL_TRUNC_ZEXT:%.*]] = trunc i64 [[VAL_ZEXT]] to i8
@@ -95,7 +63,7 @@ define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[COUNT_NEXT]] = add i32 [[COUNT]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[COUNT_NEXT]], 0
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -124,6 +92,4 @@ exit: ; preds = %loop
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 801f910c5e13d..5e916638432f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -622,7 +622,10 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[X:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = zext i1 [[TMP3]] to i64
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i64> [[TMP0]], splat (i64 12)
; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -737,17 +740,15 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[X:%.*]], true
+; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i64
+; CHECK-NEXT: [[Y:%.*]] = shl i64 [[Y1:%.*]], [[TMP1]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[Y]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[X:%.*]] = xor i1 [[X1:%.*]], true
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[X]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> [[BROADCAST_SPLAT]] to <2 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[BROADCAST_SPLAT2]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 1), [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[TMP2]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[BROADCAST_SPLAT2]], [[VEC_PHI]]
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]])
diff --git a/llvm/test/Transforms/LoopVectorize/cse-casts.ll b/llvm/test/Transforms/LoopVectorize/cse-casts.ll
index b6d7a9f81ec9d..ddd235e444494 100644
--- a/llvm/test/Transforms/LoopVectorize/cse-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/cse-casts.ll
@@ -361,10 +361,7 @@ define void @simplified_cast_preserves_irflag_type(ptr noalias %p, ptr noalias %
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i16
; CHECK-NEXT: store i16 [[TMP2]], ptr [[Q]], align 2
; CHECK-NEXT: store i16 [[TMP2]], ptr [[R]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index b72cbd333cb79..7b90b92db7eae 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -115,16 +115,16 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr, i32 %z) optsize {
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
+; CHECK-NEXT: EMIT-SCALAR vp<[[RECUR_NEXT:%.+]]> = sext ir<%y> to i32
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next>
+; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]>
-; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
+; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: WIDEN ir<%cond> = icmp eq ir<%iv>, ir<%z>
; CHECK-NEXT: EMIT vp<[[AND:%.+]]> = logical-and vp<[[MASK]]>, ir<%cond>
; CHECK-NEXT: Successor(s): pred.store
@@ -138,7 +138,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr, i32 %z) optsize {
; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]>
-; CHECK-NEXT: REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next>
+; CHECK-NEXT: REPLICATE ir<%add> = add ir<%rem>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep>
; CHECK-NEXT: Successor(s): pred.store.continue
; CHECK-EMPTY:
@@ -199,19 +199,19 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<1234>, ir<-1>, ir<1>
-; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
+; CHECK-NEXT: EMIT-SCALAR vp<[[RECUR_NEXT:%.+]]> = sext ir<%y> to i32
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next>
+; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%and.red> = phi vp<[[RDX_START]]>, ir<%and.red.next>
; CHECK-NEXT: EMIT vp<[[WIDEN_CAN:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_CAN]]>, vp<[[BTC]]>
-; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
+; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
-; CHECK-NEXT: WIDEN ir<%add> = add ir<%rem>, ir<%recur.next>
+; CHECK-NEXT: WIDEN ir<%add> = add ir<%rem>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: WIDEN ir<%and.red.next> = and ir<%and.red>, ir<%add>
; CHECK-NEXT: EMIT vp<[[SEL:%.+]]> = select vp<[[MASK]]>, ir<%and.red.next>, ir<%and.red>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -384,16 +384,16 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
+; CHECK-NEXT: EMIT-SCALAR vp<[[RECUR_NEXT:%.+]]> = sext ir<%y> to i32
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next>
+; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]>
-; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
+; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, vp<[[RECUR_NEXT]]>
; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 6e7852ec347dd..bd29398331c61 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -236,10 +236,7 @@ define void @narrow_scatter_with_uniform_addr_to_scalar(ptr noalias %src, ptr no
; VF4IC1: [[VECTOR_BODY]]:
; VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4IC1-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2
-; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-; VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
-; VF4IC1-NEXT: [[TMP1:%.*]] = trunc <4 x i16> [[BROADCAST_SPLAT]] to <4 x i8>
-; VF4IC1-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; VF4IC1-NEXT: [[TMP2:%.*]] = trunc i16 [[TMP0]] to i8
; VF4IC1-NEXT: store i8 [[TMP2]], ptr [[DST2]], align 1
; VF4IC1-NEXT: store i8 [[TMP2]], ptr [[DST]], align 1
; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -259,10 +256,7 @@ define void @narrow_scatter_with_uniform_addr_to_scalar(ptr noalias %src, ptr no
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2IC2-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2
-; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
-; VF2IC2-NEXT: [[TMP1:%.*]] = trunc <2 x i16> [[BROADCAST_SPLAT]] to <2 x i8>
-; VF2IC2-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
+; VF2IC2-NEXT: [[TMP2:%.*]] = trunc i16 [[TMP0]] to i8
; VF2IC2-NEXT: store i8 [[TMP2]], ptr [[DST2]], align 1
; VF2IC2-NEXT: store i8 [[TMP2]], ptr [[DST]], align 1
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -358,16 +352,10 @@ define void @narrow_scatter_with_uniform_addr_to_scalar_unroll(ptr noalias %src,
; VF2IC2-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]]
; VF2IC2-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP2]]
; VF2IC2-NEXT: [[TMP5:%.*]] = load i16, ptr [[TMP9]], align 4
-; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP5]], i64 0
-; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; VF2IC2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP10]], align 4
-; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP13]], i64 0
-; VF2IC2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT1]], <2 x i16> poison, <2 x i32> zeroinitializer
-; VF2IC2-NEXT: [[TMP6:%.*]] = trunc <2 x i16> [[BROADCAST_SPLAT1]] to <2 x i8>
-; VF2IC2-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
+; VF2IC2-NEXT: [[TMP7:%.*]] = trunc i16 [[TMP13]] to i8
; VF2IC2-NEXT: store i8 [[TMP7]], ptr [[DST2]], align 4
-; VF2IC2-NEXT: [[TMP11:%.*]] = trunc <2 x i16> [[BROADCAST_SPLAT]] to <2 x i8>
-; VF2IC2-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[TMP11]], i32 1
+; VF2IC2-NEXT: [[TMP12:%.*]] = trunc i16 [[TMP5]] to i8
; VF2IC2-NEXT: store i8 [[TMP12]], ptr [[TMP3]], align 4
; VF2IC2-NEXT: store i8 [[TMP7]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
index c94e42ddb718a..80ad3172e0bae 100644
--- a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
@@ -14,11 +14,14 @@ define void @loop_invariant_store(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE8:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE10:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE10]] ]
; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 8)
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP3]], <4 x i32> [[TMP2]]
@@ -32,26 +35,26 @@ define void @loop_invariant_store(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i8> [[TMP8]], i32 1
; CHECK-NEXT: store i8 [[TMP12]], ptr [[P]], align 1
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
-; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
-; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP8]], i32 2
; CHECK-NEXT: store i8 [[TMP14]], ptr [[P]], align 1
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
-; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8]]
-; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_IF9]]:
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i8> [[TMP8]], i32 3
; CHECK-NEXT: store i8 [[TMP9]], ptr [[P]], align 1
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
-; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
@@ -102,17 +105,20 @@ define void @loop_invariant_srem(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE10:.*]] ]
-; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE10]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[INDEX:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IND:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IND:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 8)
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i8> [[VEC_IND1]], splat (i8 2)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP3]], <4 x i32> [[TMP2]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[BROADCAST_SPLAT4]], <4 x i32> [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8)
; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8>
; CHECK-NEXT: [[TMP11:%.*]] = srem <4 x i8> [[VEC_IND1]], [[TMP8]]
@@ -125,30 +131,30 @@ define void @loop_invariant_srem(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
-; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i8> [[TMP11]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P]], i8 [[TMP16]]
; CHECK-NEXT: store i32 4, ptr [[TMP15]], align 4
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
-; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
-; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i8> [[TMP11]], i32 2
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], i8 [[TMP20]]
; CHECK-NEXT: store i32 4, ptr [[TMP19]], align 4
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
-; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10]]
-; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_IF11]]:
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i8> [[TMP11]], i32 3
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[P]], i8 [[TMP21]]
; CHECK-NEXT: store i32 4, ptr [[TMP23]], align 4
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
-; CHECK: [[PRED_STORE_CONTINUE10]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX1]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND1]], splat (i8 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
index ae96fe58caa26..9d32ff563f557 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
@@ -13,9 +13,9 @@ define void @trunc_minimal_bitwidth(ptr %bptr, ptr noalias %hptr, i32 %val, i64
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[VAL:%.*]] to i16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP6]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
index 3f3caa9d64cbc..0bdd96b0c0a61 100644
--- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -12,14 +12,11 @@ define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BOOL2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[BOOL1_EXT]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = zext i1 [[BOOL2]] to i8
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = xor i8 [[TMP2]], [[TMP5]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
index 6e7cdba1cd3ce..07a211053cb4e 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
@@ -11,7 +11,10 @@ define void @pr77468(ptr noalias %src, ptr noalias %dst, i1 %x) {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[X]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i16>
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i1> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 8e160e57b7703..7ccd3221bff32 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -1099,40 +1099,40 @@ define i64 @print_ext_mul_two_uses(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: EMIT vp<%3> = reduction-start-vector ir<0>, ir<0>, ir<1>
-; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext ir<%b> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%conv>, ir<%conv>
+; CHECK-NEXT: EMIT-SCALAR vp<[[CONV:%.+]]> = sext ir<%b> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul vp<[[CONV]]>, vp<[[CONV]]>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%res2> = phi vp<%3>, vp<%5>
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%res2> = phi vp<%3>, vp<[[RED:%.+]]>
; CHECK-NEXT: CLONE ir<%load> = load ir<%a>
-; CHECK-NEXT: WIDEN-CAST ir<%load.ext> = sext ir<%load> to i32
-; CHECK-NEXT: WIDEN-CAST ir<%load.ext.ext> = sext ir<%load.ext> to i64
-; CHECK-NEXT: EXPRESSION vp<%5> = ir<%res2> + reduce.add (ir<%mul> zext to i64)
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%0>
+; CHECK-NEXT: EMIT-SCALAR vp<[[LOAD_EXT:%.+]]> = sext ir<%load> to i32
+; CHECK-NEXT: EMIT-SCALAR vp<[[LOAD_EXT_EXT:%.+]]> = sext vp<[[LOAD_EXT]]> to i64
+; CHECK-NEXT: EXPRESSION vp<[[RED]]> = ir<%res2> + reduce.add (ir<%mul> zext to i64)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[IV]]>, vp<%0>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%1>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%7> = compute-reduction-result ir<%res2>, vp<%5>
-; CHECK-NEXT: EMIT vp<[[EXT_PART:%.+]]> = extract-last-part ir<%load.ext.ext>
+ ; CHECK-NEXT: EMIT vp<[[RES:%.+]]> = compute-reduction-result ir<%res2>, vp<[[RED]]>
+; CHECK-NEXT: EMIT vp<[[EXT_PART:%.+]]> = extract-last-part vp<[[LOAD_EXT_EXT]]>
; CHECK-NEXT: EMIT vp<%vector.recur.extract> = extract-last-lane vp<[[EXT_PART]]>
; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%2>, vp<%1>
; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %add.lcssa = phi i64 [ %add, %loop ] (extra operand: vp<%7> from middle.block)
+; CHECK-NEXT: IR %add.lcssa = phi i64 [ %add, %loop ] (extra operand: vp<[[RES]]> from middle.block)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<0>, ir-bb<entry> ]
; CHECK-NEXT: EMIT-SCALAR vp<%scalar.recur.init> = phi [ vp<%vector.recur.extract>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%7>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RES]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
; CHECK-NEXT: Successor(s): ir-bb<loop>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<loop>:
>From 2ba362b94f39d66cfb2cc3d0f207f1637c30f9aa Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 6 Nov 2025 00:58:15 -0800
Subject: [PATCH 4/6] No need to check PreservesUniformity for cast
---
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 56f7d2114bab0..9006238afa5db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -284,16 +284,12 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
all_of(Rep->operands(), isSingleScalar));
}
if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
- VPWidenSelectRecipe>(VPV))
+ VPWidenSelectRecipe, VPWidenCastRecipe>(VPV))
return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);
if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) {
return preservesUniformity(WidenR->getOpcode()) &&
all_of(WidenR->operands(), isSingleScalar);
}
- if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
- return preservesUniformity(CastR->getOpcode()) &&
- all_of(CastR->operands(), isSingleScalar);
- }
if (auto *VPI = dyn_cast<VPInstruction>(VPV))
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
(preservesUniformity(VPI->getOpcode()) &&
>From d37f3663ef525edd2ecef4871c4e791546380998 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 6 Nov 2025 01:21:24 -0800
Subject: [PATCH 5/6] Add metadata and flags
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7ee2e218a8926..f84d90eca54ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1677,7 +1677,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
VPBuilder Builder(CastR);
auto *Clone = Builder.createScalarCast(
CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
- CastR->getDebugLoc());
+ CastR->getDebugLoc(), *CastR, *CastR);
CastR->replaceAllUsesWith(Clone);
CastR->eraseFromParent();
continue;
>From 4708f75ff52c34c580fb4afe38a7b85a49526ae8 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 30 Dec 2025 01:29:15 -0800
Subject: [PATCH 6/6] use common replaceAllUsesWith
---
.../lib/Transforms/Vectorize/VPlanTransforms.cpp | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f84d90eca54ce..c757362f9afda 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1673,19 +1673,17 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
}))
continue;
+ VPSingleDefRecipe *Clone;
if (auto *CastR = dyn_cast<VPWidenCastRecipe>(RepOrWidenR)) {
- VPBuilder Builder(CastR);
- auto *Clone = Builder.createScalarCast(
+ Clone = VPBuilder(CastR).createScalarCast(
CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
CastR->getDebugLoc(), *CastR, *CastR);
- CastR->replaceAllUsesWith(Clone);
- CastR->eraseFromParent();
- continue;
+ } else {
+ Clone = new VPReplicateRecipe(
+ RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
+ true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
+ Clone->insertBefore(RepOrWidenR);
}
- auto *Clone = new VPReplicateRecipe(
- RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
- true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
- Clone->insertBefore(RepOrWidenR);
RepOrWidenR->replaceAllUsesWith(Clone);
if (isDeadRecipe(*RepOrWidenR))
RepOrWidenR->eraseFromParent();
More information about the llvm-commits
mailing list