[llvm] b76089c - [VPlan] Skip uses-scalars restriction if one of ops needs broadcast. (#168246)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 02:26:31 PST 2025
Author: Florian Hahn
Date: 2025-11-28T10:26:27Z
New Revision: b76089c7f3d6593d2e2c83db7dbf4965b656bd8c
URL: https://github.com/llvm/llvm-project/commit/b76089c7f3d6593d2e2c83db7dbf4965b656bd8c
DIFF: https://github.com/llvm/llvm-project/commit/b76089c7f3d6593d2e2c83db7dbf4965b656bd8c.diff
LOG: [VPlan] Skip uses-scalars restriction if one of ops needs broadcast. (#168246)
Update the logic in narrowToSingleScalar to allow narrowing even if not
all users use scalars, if at least one of the operands already needs
broadcasting.
In that case, there won't be any additional broadcasts introduced. This
should allow removing the special handling for stores, which can
introduce additional broadcasts currently.
Fixes https://github.com/llvm/llvm-project/issues/169668.
PR: https://github.com/llvm/llvm-project/pull/168246
Added:
llvm/test/Transforms/LoopVectorize/AArch64/widen-gep-all-indices-invariant.ll
Modified:
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
llvm/test/Transforms/LoopVectorize/cse-casts.ll
llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 67aca48bcaf8f..b12f8ccc73c7e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1522,32 +1522,36 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
continue;
}
- // Skip recipes that aren't single scalars or don't have only their
- // scalar results used. In the latter case, we would introduce extra
- // broadcasts.
- if (!vputils::isSingleScalar(RepOrWidenR) ||
- !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
- if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
- // VPWidenStore doesn't have users, and stores are always
- // profitable to widen: hence, permitting address and mask
- // operands, and single-scalar stored values is an important leaf
- // condition. The assert must hold as we checked the RepOrWidenR
- // operand against vputils::isSingleScalar.
- assert(RepOrWidenR != Store->getStoredValue() ||
- vputils::isSingleScalar(Store->getStoredValue()));
- (void)Store;
- return true;
- }
-
- if (auto *VPI = dyn_cast<VPInstruction>(U)) {
- unsigned Opcode = VPI->getOpcode();
- if (Opcode == VPInstruction::ExtractLastElement ||
- Opcode == VPInstruction::ExtractLastLanePerPart ||
- Opcode == VPInstruction::ExtractPenultimateElement)
- return true;
- }
-
- return U->usesScalars(RepOrWidenR);
+ // Skip recipes that aren't single scalars.
+ if (!vputils::isSingleScalar(RepOrWidenR))
+ continue;
+
+ // Skip recipes for which conversion to single-scalar does introduce
+ // additional broadcasts. No extra broadcasts are needed, if either only
+ // the scalars of the recipe are used, or at least one of the operands
+ // would require a broadcast. In the latter case, the single-scalar may
+ // need to be broadcasted, but another broadcast is removed.
+ if (!all_of(RepOrWidenR->users(),
+ [RepOrWidenR](const VPUser *U) {
+ if (auto *VPI = dyn_cast<VPInstruction>(U)) {
+ unsigned Opcode = VPI->getOpcode();
+ if (Opcode == VPInstruction::ExtractLastElement ||
+ Opcode == VPInstruction::ExtractLastLanePerPart ||
+ Opcode == VPInstruction::ExtractPenultimateElement)
+ return true;
+ }
+
+ return U->usesScalars(RepOrWidenR);
+ }) &&
+ none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
+ if (Op->getSingleUser() != RepOrWidenR)
+ return false;
+ // Non-constant live-ins require broadcasts, while constants do not
+ // need explicit broadcasts.
+ bool LiveInNeedsBroadcast =
+ Op->isLiveIn() && !isa<Constant>(Op->getLiveInIRValue());
+ auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
+ return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
}))
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 118f165050602..b549a06f08f8c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -540,10 +540,10 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 8
; DEFAULT-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]]
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP1]], i64 0
+; DEFAULT-NEXT: [[TMP8:%.*]] = or i16 [[TMP1]], 1
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP8]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP8:%.*]] = or <vscale x 4 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
-; DEFAULT-NEXT: [[TMP9:%.*]] = uitofp <vscale x 4 x i16> [[TMP8]] to <vscale x 4 x double>
+; DEFAULT-NEXT: [[TMP9:%.*]] = uitofp <vscale x 4 x i16> [[BROADCAST_SPLAT]] to <vscale x 4 x double>
; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2
; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i64 [[TMP11]]
@@ -585,10 +585,10 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP12]], i64 0
+; PRED-NEXT: [[TMP11:%.*]] = or i16 [[TMP12]], 1
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP11]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP11:%.*]] = or <vscale x 4 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
-; PRED-NEXT: [[TMP13:%.*]] = uitofp <vscale x 4 x i16> [[TMP11]] to <vscale x 4 x double>
+; PRED-NEXT: [[TMP13:%.*]] = uitofp <vscale x 4 x i16> [[BROADCAST_SPLAT]] to <vscale x 4 x double>
; PRED-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[TMP13]], ptr align 8 [[NEXT_GEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
@@ -694,10 +694,10 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP15]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP16:%.*]] = load float, ptr [[SRC_2]], align 4
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP16]], i64 0
+; DEFAULT-NEXT: [[TMP17:%.*]] = fmul float [[TMP16]], 0.000000e+00
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP17]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP17:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
-; DEFAULT-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP17]])
+; DEFAULT-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[BROADCAST_SPLAT]])
; DEFAULT-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_3]], align 4
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer
@@ -879,10 +879,10 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
; PRED-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_2]], align 4
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
+; PRED-NEXT: [[TMP20:%.*]] = fmul float [[TMP19]], 0.000000e+00
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP20]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
-; PRED-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
-; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP20]])
+; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[BROADCAST_SPLAT]])
; PRED-NEXT: [[TMP22:%.*]] = load float, ptr [[SRC_3]], align 4
; PRED-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP22]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 44ae1757ce6e6..f2c0ca30a6c18 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -59,8 +59,6 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; VSCALEFORTUNING2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; VSCALEFORTUNING2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; VSCALEFORTUNING2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
-; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[TMP7:%.*]] = add i64 [[Y]], 1
@@ -68,7 +66,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; VSCALEFORTUNING2-NEXT: [[TMP9:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP11:%.*]] = or <vscale x 4 x i32> [[TMP9]], [[TMP10]]
-; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
+; VSCALEFORTUNING2-NEXT: [[TMP16:%.*]] = or i32 [[Z]], [[X]]
+; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP16]], i64 0
+; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i32> [[TMP13]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[TMP14]] to <vscale x 4 x i64>
@@ -180,8 +180,6 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
@@ -195,7 +193,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[TMP13:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; PRED-NEXT: [[TMP14:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; PRED-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; PRED-NEXT: [[TMP16:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
+; PRED-NEXT: [[TMP20:%.*]] = or i32 [[Z]], [[X]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
+; PRED-NEXT: [[TMP16:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[TMP17:%.*]] = and <vscale x 4 x i32> [[TMP16]], splat (i32 1)
; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[TMP17]], splat (i32 1)
; PRED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i32> [[TMP18]] to <vscale x 4 x i64>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-gep-all-indices-invariant.ll
new file mode 100644
index 0000000000000..97cc6929e44d5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-gep-all-indices-invariant.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macosx -S %s | FileCheck %s
+
+; Test case for https://github.com/llvm/llvm-project/issues/169668.
+define i32 @gep_with_all_invariant_operands(ptr %src.0, ptr %src.1, i64 %n, i1 %cond) #0 {
+; CHECK-LABEL: define i32 @gep_with_all_invariant_operands(
+; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]], i1 [[COND:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[N]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[COND]], ptr [[SRC_1]], ptr [[TMP8]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP9]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP7]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = xor i1 [[TMP10]], true
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP12:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true)
+; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP12]], i1 false)
+; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], i64 [[LAST_ACTIVE_LANE]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[TMP16]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep = getelementptr i32, ptr %src.0, i64 %n
+ %ptr = select i1 %cond, ptr %src.1, ptr %gep
+ %val = load i32, ptr %ptr, align 4
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ult i64 %iv, %n
+ br i1 %cmp, label %loop, label %exit, !llvm.loop !0
+
+exit:
+ ret i32 %val
+}
+
+attributes #0 = { "target-cpu"="neoverse-v2" }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index 8d4d282a5236d..0723f16677090 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -9,10 +9,10 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[B]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[A]], 48
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = shl <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
-; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[TMP5]], splat (i64 52)
+; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
index 6ec010cdcc248..651e2ad5e74da 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
@@ -686,16 +686,256 @@ exit:
; Test for https://github.com/llvm/llvm-project/issues/129236.
define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev(
-; CHECK-NEXT: entry:
+; CHECK-NEXT: iter.check:
; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0
; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32
; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i1> poison, i1 [[CMP_I]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i1> [[BROADCAST_SPLATINSERT]], <32 x i1> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP60:%.*]] = xor <32 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE62:%.*]] ]
+; CHECK-NEXT: [[TMP61:%.*]] = extractelement <32 x i1> [[TMP60]], i32 0
+; CHECK-NEXT: br i1 [[TMP61]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
+; CHECK: pred.urem.if:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE]]
+; CHECK: pred.urem.continue:
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x i1> [[TMP60]], i32 1
+; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_UREM_IF1:%.*]], label [[PRED_UREM_CONTINUE2:%.*]]
+; CHECK: pred.urem.if1:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE2]]
+; CHECK: pred.urem.continue2:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <32 x i1> [[TMP60]], i32 2
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]]
+; CHECK: pred.urem.if3:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE4]]
+; CHECK: pred.urem.continue4:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <32 x i1> [[TMP60]], i32 3
+; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]]
+; CHECK: pred.urem.if5:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE6]]
+; CHECK: pred.urem.continue6:
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <32 x i1> [[TMP60]], i32 4
+; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8:%.*]]
+; CHECK: pred.urem.if7:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE8]]
+; CHECK: pred.urem.continue8:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP60]], i32 5
+; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_UREM_IF9:%.*]], label [[PRED_UREM_CONTINUE10:%.*]]
+; CHECK: pred.urem.if9:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE10]]
+; CHECK: pred.urem.continue10:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <32 x i1> [[TMP60]], i32 6
+; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_UREM_IF11:%.*]], label [[PRED_UREM_CONTINUE12:%.*]]
+; CHECK: pred.urem.if11:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE12]]
+; CHECK: pred.urem.continue12:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP60]], i32 7
+; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_UREM_IF13:%.*]], label [[PRED_UREM_CONTINUE14:%.*]]
+; CHECK: pred.urem.if13:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE14]]
+; CHECK: pred.urem.continue14:
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <32 x i1> [[TMP60]], i32 8
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF15:%.*]], label [[PRED_UREM_CONTINUE16:%.*]]
+; CHECK: pred.urem.if15:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE16]]
+; CHECK: pred.urem.continue16:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i1> [[TMP60]], i32 9
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UREM_IF17:%.*]], label [[PRED_UREM_CONTINUE18:%.*]]
+; CHECK: pred.urem.if17:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE18]]
+; CHECK: pred.urem.continue18:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <32 x i1> [[TMP60]], i32 10
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_UREM_IF19:%.*]], label [[PRED_UREM_CONTINUE20:%.*]]
+; CHECK: pred.urem.if19:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE20]]
+; CHECK: pred.urem.continue20:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i1> [[TMP60]], i32 11
+; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_UREM_IF21:%.*]], label [[PRED_UREM_CONTINUE22:%.*]]
+; CHECK: pred.urem.if21:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE22]]
+; CHECK: pred.urem.continue22:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i1> [[TMP60]], i32 12
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_UREM_IF23:%.*]], label [[PRED_UREM_CONTINUE24:%.*]]
+; CHECK: pred.urem.if23:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE24]]
+; CHECK: pred.urem.continue24:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <32 x i1> [[TMP60]], i32 13
+; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF25:%.*]], label [[PRED_UREM_CONTINUE26:%.*]]
+; CHECK: pred.urem.if25:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE26]]
+; CHECK: pred.urem.continue26:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <32 x i1> [[TMP60]], i32 14
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_UREM_IF27:%.*]], label [[PRED_UREM_CONTINUE28:%.*]]
+; CHECK: pred.urem.if27:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE28]]
+; CHECK: pred.urem.continue28:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <32 x i1> [[TMP60]], i32 15
+; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_UREM_IF29:%.*]], label [[PRED_UREM_CONTINUE30:%.*]]
+; CHECK: pred.urem.if29:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE30]]
+; CHECK: pred.urem.continue30:
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <32 x i1> [[TMP60]], i32 16
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_UREM_IF31:%.*]], label [[PRED_UREM_CONTINUE32:%.*]]
+; CHECK: pred.urem.if31:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE32]]
+; CHECK: pred.urem.continue32:
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i1> [[TMP60]], i32 17
+; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_UREM_IF33:%.*]], label [[PRED_UREM_CONTINUE34:%.*]]
+; CHECK: pred.urem.if33:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE34]]
+; CHECK: pred.urem.continue34:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <32 x i1> [[TMP60]], i32 18
+; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_UREM_IF35:%.*]], label [[PRED_UREM_CONTINUE36:%.*]]
+; CHECK: pred.urem.if35:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE36]]
+; CHECK: pred.urem.continue36:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <32 x i1> [[TMP60]], i32 19
+; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_UREM_IF37:%.*]], label [[PRED_UREM_CONTINUE38:%.*]]
+; CHECK: pred.urem.if37:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE38]]
+; CHECK: pred.urem.continue38:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i1> [[TMP60]], i32 20
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_UREM_IF39:%.*]], label [[PRED_UREM_CONTINUE40:%.*]]
+; CHECK: pred.urem.if39:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE40]]
+; CHECK: pred.urem.continue40:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i1> [[TMP60]], i32 21
+; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_UREM_IF41:%.*]], label [[PRED_UREM_CONTINUE42:%.*]]
+; CHECK: pred.urem.if41:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE42]]
+; CHECK: pred.urem.continue42:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i1> [[TMP60]], i32 22
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_UREM_IF43:%.*]], label [[PRED_UREM_CONTINUE44:%.*]]
+; CHECK: pred.urem.if43:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE44]]
+; CHECK: pred.urem.continue44:
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <32 x i1> [[TMP60]], i32 23
+; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_UREM_IF45:%.*]], label [[PRED_UREM_CONTINUE46:%.*]]
+; CHECK: pred.urem.if45:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE46]]
+; CHECK: pred.urem.continue46:
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <32 x i1> [[TMP60]], i32 24
+; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_UREM_IF47:%.*]], label [[PRED_UREM_CONTINUE48:%.*]]
+; CHECK: pred.urem.if47:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE48]]
+; CHECK: pred.urem.continue48:
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <32 x i1> [[TMP60]], i32 25
+; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_UREM_IF49:%.*]], label [[PRED_UREM_CONTINUE50:%.*]]
+; CHECK: pred.urem.if49:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE50]]
+; CHECK: pred.urem.continue50:
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP60]], i32 26
+; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_UREM_IF51:%.*]], label [[PRED_UREM_CONTINUE52:%.*]]
+; CHECK: pred.urem.if51:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE52]]
+; CHECK: pred.urem.continue52:
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i1> [[TMP60]], i32 27
+; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_UREM_IF53:%.*]], label [[PRED_UREM_CONTINUE54:%.*]]
+; CHECK: pred.urem.if53:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE54]]
+; CHECK: pred.urem.continue54:
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <32 x i1> [[TMP60]], i32 28
+; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_UREM_IF55:%.*]], label [[PRED_UREM_CONTINUE56:%.*]]
+; CHECK: pred.urem.if55:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE56]]
+; CHECK: pred.urem.continue56:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <32 x i1> [[TMP60]], i32 29
+; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_UREM_IF57:%.*]], label [[PRED_UREM_CONTINUE58:%.*]]
+; CHECK: pred.urem.if57:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE58]]
+; CHECK: pred.urem.continue58:
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <32 x i1> [[TMP60]], i32 30
+; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_UREM_IF59:%.*]], label [[PRED_UREM_CONTINUE60:%.*]]
+; CHECK: pred.urem.if59:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE60]]
+; CHECK: pred.urem.continue60:
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i1> [[TMP60]], i32 31
+; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_UREM_IF61:%.*]], label [[PRED_UREM_CONTINUE62]]
+; CHECK: pred.urem.if61:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE62]]
+; CHECK: pred.urem.continue62:
+; CHECK-NEXT: [[TMP33:%.*]] = select <32 x i1> [[TMP60]], <32 x i1> poison, <32 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP34:%.*]] = or <32 x i1> [[TMP33]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[CMP_I]], <32 x i32> zeroinitializer, <32 x i32> poison
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <32 x i32> [[PREDPHI]], i32 0
+; CHECK-NEXT: [[TMP36:%.*]] = ashr i32 [[CONV5_I]], [[TMP35]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <32 x i32> poison, i32 [[TMP36]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT64:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT63]], <32 x i32> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP37:%.*]] = icmp eq <32 x i32> [[BROADCAST_SPLAT64]], zeroinitializer
+; CHECK-NEXT: [[TMP38:%.*]] = shl <32 x i32> [[PREDPHI]], splat (i32 24)
+; CHECK-NEXT: [[TMP39:%.*]] = ashr exact <32 x i32> [[TMP38]], splat (i32 24)
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <32 x i1> [[TMP37]], i32 0
+; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], <32 x i32> [[TMP39]], <32 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI65:%.*]] = select <32 x i1> [[TMP34]], <32 x i32> [[TMP41]], <32 x i32> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
+; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[PREDPHI65]], i32 31
+; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT66:%.*]] = insertelement <4 x i1> poison, i1 [[CMP_I]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT67:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT66]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT67]], splat (i1 true)
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX68:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT81:%.*]], [[PRED_UREM_CONTINUE76:%.*]] ]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP44]], i32 0
+; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_UREM_IF69:%.*]], label [[PRED_UREM_CONTINUE70:%.*]]
+; CHECK: pred.urem.if69:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE70]]
+; CHECK: pred.urem.continue70:
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i1> [[TMP44]], i32 1
+; CHECK-NEXT: br i1 [[TMP46]], label [[PRED_UREM_IF71:%.*]], label [[PRED_UREM_CONTINUE72:%.*]]
+; CHECK: pred.urem.if71:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE72]]
+; CHECK: pred.urem.continue72:
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[TMP44]], i32 2
+; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_UREM_IF73:%.*]], label [[PRED_UREM_CONTINUE74:%.*]]
+; CHECK: pred.urem.if73:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE74]]
+; CHECK: pred.urem.continue74:
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i1> [[TMP44]], i32 3
+; CHECK-NEXT: br i1 [[TMP48]], label [[PRED_UREM_IF75:%.*]], label [[PRED_UREM_CONTINUE76]]
+; CHECK: pred.urem.if75:
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE76]]
+; CHECK: pred.urem.continue76:
+; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP50:%.*]] = or <4 x i1> [[TMP49]], [[BROADCAST_SPLAT67]]
+; CHECK-NEXT: [[PREDPHI77:%.*]] = select i1 [[CMP_I]], <4 x i32> zeroinitializer, <4 x i32> poison
+; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[PREDPHI77]], i32 0
+; CHECK-NEXT: [[TMP52:%.*]] = ashr i32 [[CONV5_I]], [[TMP51]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT78:%.*]] = insertelement <4 x i32> poison, i32 [[TMP52]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT79:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT78]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP53:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT79]], zeroinitializer
+; CHECK-NEXT: [[TMP54:%.*]] = shl <4 x i32> [[PREDPHI77]], splat (i32 24)
+; CHECK-NEXT: [[TMP55:%.*]] = ashr exact <4 x i32> [[TMP54]], splat (i32 24)
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP53]], i32 0
+; CHECK-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], <4 x i32> [[TMP55]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI80:%.*]] = select <4 x i1> [[TMP50]], <4 x i32> [[TMP57]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT81]] = add nuw i32 [[INDEX68]], 4
+; CHECK-NEXT: [[TMP58:%.*]] = icmp eq i32 [[INDEX_NEXT81]], 100
+; CHECK-NEXT: br i1 [[TMP58]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i32> [[PREDPHI80]], i32 3
+; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 4, [[VEC_EPILOG_ITER_CHECK]] ], [ 100, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]]
; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: then:
-; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER1]] ]
; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]]
; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0
; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24
@@ -710,9 +950,9 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ [[TMP59]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[P_2_LCSSA]]
;
entry:
@@ -838,7 +1078,7 @@ define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 {
; CHECK-NEXT: store <8 x i32> [[PREDPHI]], ptr [[TMP42]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
@@ -858,7 +1098,7 @@ define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 {
; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 16
-; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -1197,12 +1437,12 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32)
; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP164:%.*]] = extractelement <32 x i64> [[PREDPHI]], i32 31
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF13:![0-9]+]]
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF17:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <8 x i1> poison, i1 [[C]], i64 0
@@ -1293,7 +1533,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 {
; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8
; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8)
; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000
-; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP209:%.*]] = extractelement <8 x i64> [[PREDPHI85]], i32 7
; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -1312,7 +1552,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 {
; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ [[ZEXT]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i64 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP164]], [[MIDDLE_BLOCK]] ], [ [[TMP209]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[MERGE_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index c70a3aa249919..801f910c5e13d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -376,10 +376,10 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP20]]
; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META6:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP21]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[TMP31]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
@@ -739,10 +739,10 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[Y:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[X:%.*]], i64 0
+; CHECK-NEXT: [[X:%.*]] = xor i1 [[X1:%.*]], true
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[X]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> [[TMP0]] to <2 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> [[BROADCAST_SPLAT]] to <2 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[BROADCAST_SPLAT2]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
index f44d3008cbaa5..3813560d9300a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -587,10 +587,10 @@ define double @test_load_used_by_other_load_scev_low_trip_count(ptr %ptr.a, ptr
; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]]
; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8
; I64-NEXT: [[ADD1:%.*]] = fadd double [[TMP10]], 0.000000e+00
-; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
-; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8
+; I64-NEXT: [[GEP_C_OFFSET:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
+; I64-NEXT: [[LOAD_C:%.*]] = load double, ptr [[GEP_C_OFFSET]], align 8
; I64-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00
-; I64-NEXT: [[MUL2:%.*]] = fmul double [[TMP15]], 0.000000e+00
+; I64-NEXT: [[MUL2:%.*]] = fmul double [[LOAD_C]], 0.000000e+00
; I64-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00
; I64-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00
; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/cse-casts.ll b/llvm/test/Transforms/LoopVectorize/cse-casts.ll
index 4737a56df2735..b6d7a9f81ec9d 100644
--- a/llvm/test/Transforms/LoopVectorize/cse-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/cse-casts.ll
@@ -14,9 +14,9 @@ define i8 @preserve_flags_when_cloning_trunc(i8 %start, ptr noalias %src, ptr no
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i8> [ splat (i8 1), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i64 4
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 9269558bd5f65..5cb8bd911df75 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -180,7 +180,7 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: WIDEN ir<%for.x.next> = mul ir<%x>, ir<2>
+; CHECK-NEXT: CLONE ir<%for.x.next> = mul ir<%x>, ir<2>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 5385a83dfac65..1736b66511bfb 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -161,18 +161,16 @@ define void @narrow_widen_store_user(i32 %x, ptr noalias %A, ptr noalias %B) {
; VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
; VF4IC1: [[VECTOR_PH]]:
; VF4IC1-NEXT: [[TMP0:%.*]] = add i32 [[X]], 1
-; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; VF4IC1-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 3
-; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
+; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; VF4IC1-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; VF4IC1-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], splat (i32 3)
; VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF4IC1: [[VECTOR_BODY]]:
; VF4IC1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i32 [[INDEX]]
; VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]]
-; VF4IC1-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 4
-; VF4IC1-NEXT: store <4 x i32> [[TMP1]], ptr [[TMP3]], align 4
+; VF4IC1-NEXT: store <4 x i32> [[TMP1]], ptr [[TMP2]], align 4
+; VF4IC1-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP3]], align 4
; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -187,22 +185,20 @@ define void @narrow_widen_store_user(i32 %x, ptr noalias %A, ptr noalias %B) {
; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
; VF2IC2: [[VECTOR_PH]]:
; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[X]], 1
-; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
-; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; VF2IC2-NEXT: [[TMP7:%.*]] = mul i32 [[TMP0]], 3
-; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
+; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; VF2IC2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VF2IC2-NEXT: [[TMP7:%.*]] = mul <2 x i32> [[TMP1]], splat (i32 3)
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i32 [[INDEX]]
; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]]
; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP2]], i64 2
-; VF2IC2-NEXT: store <2 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 4
-; VF2IC2-NEXT: store <2 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP2]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i64 2
-; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP3]], align 4
-; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP5]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP3]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP5]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; VF2IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
index 71e6879b017da..70a808f9b336e 100644
--- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
@@ -88,10 +88,7 @@ define i64 @test2(i64 %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[Y:%.*]], 0
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i64> splat (i64 77), <4 x i64> splat (i64 55)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: vector.body:
@@ -133,12 +130,9 @@ define i32 @test3(i64 %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[Y:%.*]], 0
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> splat (i32 55)
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index f17866718a0ee..8d14299a02878 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -6,14 +6,14 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C_1:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT3]], <2 x i1> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
-; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT5]], splat (i32 1)
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[A:%.*]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP6]], <2 x i1> [[BROADCAST_SPLAT2]], <2 x i1> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
index 52555d550f3d9..c94e42ddb718a 100644
--- a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
@@ -9,10 +9,10 @@ define void @loop_invariant_store(ptr %p, i64 %a, i8 %b) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[A]], 48
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
-; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> [[TMP0]], splat (i64 52)
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -97,10 +97,10 @@ define void @loop_invariant_srem(ptr %p, i64 %a, i8 %b) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[A]], 48
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
-; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> [[TMP0]], splat (i64 52)
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
index 183462f71d480..b55c563162e13 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -371,8 +371,8 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) {
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i64 [[N]], -1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT7]], splat (i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[COND:%.*]], 7
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> poison, i1 [[TMP5]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
index 498c58d1bfd82..d34b14bf7be71 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
@@ -9,10 +9,10 @@ define void @test_pr47927_lshr_const_shift_ops(ptr %dst, i32 %f) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 [[F]], 18
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[BROADCAST_SPLAT]], splat (i32 18)
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i8>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -53,10 +53,10 @@ define void @test_shl_const_shift_ops(ptr %dst, i32 %f) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[F]], 18
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = shl <4 x i32> [[BROADCAST_SPLAT]], splat (i32 18)
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i8>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -97,10 +97,10 @@ define void @test_ashr_const_shift_ops(ptr %dst, i32 %f) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[F]], 18
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = ashr <4 x i32> [[BROADCAST_SPLAT]], splat (i32 18)
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i8>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
More information about the llvm-commits
mailing list