[llvm] [LLVM][PhaseOrdering] Run CSE after InstCombine has cleaned the result of vectorisation. (PR #120443)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 12 05:01:37 PST 2025
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/120443
From 86b0bcc4dabeef98bfd542577d6d9bd8faa0845a Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 18 Dec 2024 13:17:13 +0000
Subject: [PATCH 1/2] Add test to show missing CSE.
---
.../post-vectorisation-combines-with-cse.ll | 92 +++++++++++++++++++
1 file changed, 92 insertions(+)
create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
new file mode 100644
index 0000000000000..dd0dee422058c
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='default<O2>' --prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local i32 @check(ptr noundef readonly captures(none) %mask, ptr noundef readonly captures(none) %result, i32 noundef %n) #0 {
+; CHECK-LABEL: define dso_local i32 @check(
+; CHECK-SAME: ptr noundef readonly captures(none) [[MASK:%.*]], ptr noundef readonly captures(none) [[RESULT:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP13]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[MASK]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[RESULT]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD3]], splat (float 2.000000e+00)
+; CHECK-NEXT: [[NOT_:%.*]] = xor <vscale x 4 x i1> [[TMP4]], splat (i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[NOT_]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[NARROW8:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI5:%.*]] = zext <vscale x 4 x i1> [[NARROW8]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[BIN_RDX]] = add <vscale x 4 x i32> [[VEC_PHI]], [[PREDPHI5]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP10]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP17:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[COUNT_0_LCSSA]]
+;
+entry:
+ %cmp13 = icmp sgt i32 %n, 0
+ br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %count.1.lcssa = phi i32 [ %count.1, %for.inc ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %count.0.lcssa = phi i32 [ 0, %entry ], [ %count.1.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %count.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %count.014 = phi i32 [ 0, %for.body.preheader ], [ %count.1, %for.inc ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %mask, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ br i1 %cmp1, label %land.lhs.true, label %for.inc
+
+land.lhs.true: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds nuw float, ptr %result, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx4, align 4
+ %cmp6 = fcmp une float %1, 2.000000e+00
+ br i1 %cmp6, label %if.then, label %for.inc
+
+if.then: ; preds = %land.lhs.true
+ %inc = add nsw i32 %count.014, 1
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %land.lhs.true, %if.then
+ %count.1 = phi i32 [ %inc, %if.then ], [ %count.014, %land.lhs.true ], [ %count.014, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+v9a,+sve2" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
From 01eaa30dd7191b1e4626214f625f53f07d18536f Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 18 Dec 2024 13:23:29 +0000
Subject: [PATCH 2/2] [LLVM][PhaseOrdering] Run CSE after InstCombine has
cleaned the result of vectorisation.
I revisited 58690 and was puzzled why the obvious combines do not
fire. I believe the cause is combines that end with replaceOperand:
they can introduce duplicate expression trees (albeit tiny ones),
which in turn block combines that compare expression pointers.

Perhaps adding the extra CSE run is too big a hammer, but EarlyCSE
already exists as the first of the extra-vectorisation passes for a
similar reason, so promoting it to the main vectorisation pipeline
seems reasonable, given these issues become more prevalent as
vectorisation capabilities increase.
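
To make the failure mode concrete, here is a hand-reduced IR sketch
(simplified names, not actual pipeline output) mirroring the duplicated
TMP4/TMP9 selects in the test above: %sel.1 and %sel.2 are textually
identical, but as distinct Values they defeat combines that match
operands by pointer equality.

  define <vscale x 4 x i1> @sketch(<vscale x 4 x i1> %lane.mask,
                                   <vscale x 4 x i1> %cmp,
                                   <vscale x 4 x i1> %fcmp) {
    ; First copy of the if-conversion predicate.
    %sel.1 = select <vscale x 4 x i1> %lane.mask, <vscale x 4 x i1> %cmp, <vscale x 4 x i1> zeroinitializer
    %not = xor <vscale x 4 x i1> %sel.1, splat (i1 true)
    %or = select <vscale x 4 x i1> %not, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %fcmp
    ; Duplicate of %sel.1 introduced via replaceOperand; a distinct
    ; Value, so pointer-comparing combines cannot equate it to %sel.1.
    %sel.2 = select <vscale x 4 x i1> %lane.mask, <vscale x 4 x i1> %cmp, <vscale x 4 x i1> zeroinitializer
    %narrow = select <vscale x 4 x i1> %sel.2, <vscale x 4 x i1> %or, <vscale x 4 x i1> zeroinitializer
    ret <vscale x 4 x i1> %narrow
  }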
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 10 +++--
llvm/test/Other/opt-pipeline-vector-passes.ll | 1 -
.../LoopVectorize/X86/float-induction-x86.ll | 10 +----
...ting-sinking-required-for-vectorization.ll | 3 +-
.../AArch64/indvars-vectorization.ll | 8 +---
.../AArch64/interleavevectorization.ll | 32 +++++----------
.../post-vectorisation-combines-with-cse.ll | 39 +++++++++----------
.../PhaseOrdering/ARM/arm_mult_q15.ll | 12 ++----
8 files changed, 42 insertions(+), 73 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index adb005b5839f5..0e7af240b22c3 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1325,16 +1325,19 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
}
// Cleanup after the loop optimization passes.
FPM.addPass(InstCombinePass());
+ // InstCombine can create CSE opportunities when it cleans the result of loop
+ // vectorization. They occur when combines use replaceOperand, which happens
+ // most often when combining the boolean operations created by if-conversion.
+ ExtraFunctionPassManager<ShouldRunExtraVectorPasses> ExtraPasses;
+ ExtraPasses.addPass(EarlyCSEPass());
if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
- ExtraFunctionPassManager<ShouldRunExtraVectorPasses> ExtraPasses;
// At higher optimization levels, try to clean up any runtime overlap and
// alignment checks inserted by the vectorizer. We want to track correlated
// runtime checks for two inner loops in the same outer loop, fold any
// common computations, hoist loop-invariant aspects out of any outer loop,
// and unswitch the runtime checks if possible. Once hoisted, we may have
// dead (or speculatable) control flows or more combining opportunities.
- ExtraPasses.addPass(EarlyCSEPass());
ExtraPasses.addPass(CorrelatedValuePropagationPass());
ExtraPasses.addPass(InstCombinePass());
LoopPassManager LPM;
@@ -1348,9 +1351,10 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
ExtraPasses.addPass(
SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
ExtraPasses.addPass(InstCombinePass());
- FPM.addPass(std::move(ExtraPasses));
}
+ FPM.addPass(std::move(ExtraPasses));
+
// Now that we've formed fast to execute loop structures, we do further
// optimizations. These are run afterward as they might block doing complex
// analyses and transforms such as what are needed for loop vectorization.
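
As an illustration of the comment added above, here is a hand-reduced
sketch (assumed names, not generated output) of the fold the promoted
EarlyCSE run unlocks for the InstCombine run that follows it:

  define <vscale x 4 x i1> @after_cse(<vscale x 4 x i1> %p, <vscale x 4 x i1> %f) {
    ; (not %p) ? true : %f, i.e. or(not %p, %f).
    %not = xor <vscale x 4 x i1> %p, splat (i1 true)
    %or = select <vscale x 4 x i1> %not, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %f
    ; With the duplicate predicate CSE'd away, both selects now refer
    ; to the same %p, so InstCombine can fold this to
    ;   select <vscale x 4 x i1> %p, <vscale x 4 x i1> %f, <vscale x 4 x i1> zeroinitializer
    ; which is the NARROW4 select in the updated test output below.
    %narrow = select <vscale x 4 x i1> %p, <vscale x 4 x i1> %or, <vscale x 4 x i1> zeroinitializer
    ret <vscale x 4 x i1> %narrow
  }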
diff --git a/llvm/test/Other/opt-pipeline-vector-passes.ll b/llvm/test/Other/opt-pipeline-vector-passes.ll
index 83a9454c0d808..4d277c31d9445 100644
--- a/llvm/test/Other/opt-pipeline-vector-passes.ll
+++ b/llvm/test/Other/opt-pipeline-vector-passes.ll
@@ -19,7 +19,6 @@
; Everything runs at -O2.
; O2-LABEL: Running pass: LoopVectorizePass
-; O2-NOT: Running pass: EarlyCSEPass
; O2-NOT: Running pass: LICMPass
; O2: Running pass: SLPVectorizerPass
; O2: Running pass: VectorCombinePass
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index 537fda42d3a1e..3e9c1e2ecae79 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -54,9 +54,6 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]]
; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AUTO_VEC: vec.epilog.iter.check:
-; AUTO_VEC-NEXT: [[DOTCAST7:%.*]] = uitofp nneg i64 [[N_VEC]] to float
-; AUTO_VEC-NEXT: [[TMP6:%.*]] = fmul fast float [[DOTCAST7]], 5.000000e-01
-; AUTO_VEC-NEXT: [[IND_END8:%.*]] = fadd fast float [[TMP6]], 1.000000e+00
; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[ZEXT]], 28
; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
@@ -85,7 +82,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
; AUTO_VEC-NEXT: br i1 [[CMP_N14]], label [[FOR_END]], label [[FOR_BODY]]
; AUTO_VEC: for.body:
; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]]
; AUTO_VEC-NEXT: store float [[X_06]], ptr [[ARRAYIDX]], align 4
; AUTO_VEC-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01
@@ -434,9 +431,6 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AUTO_VEC: vec.epilog.iter.check:
-; AUTO_VEC-NEXT: [[DOTCAST10:%.*]] = uitofp nneg i64 [[N_VEC]] to float
-; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul reassoc float [[DOTCAST10]], 4.200000e+01
-; AUTO_VEC-NEXT: [[IND_END11:%.*]] = fadd reassoc float [[TMP11]], 1.000000e+00
; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 28
; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
@@ -469,7 +463,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
; AUTO_VEC-NEXT: ret void
; AUTO_VEC: for.body:
; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[P]], i64 [[INDVARS_IV]]
; AUTO_VEC-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP16]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index f583a616dd375..097eeb01792fa 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -168,9 +168,8 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) {
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD11]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]]
; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 16
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope [[META9]], !noalias [[META11]]
-; CHECK-NEXT: store <4 x float> [[PREDPHI12]], ptr [[TMP12]], align 4, !alias.scope [[META9]], !noalias [[META11]]
+; CHECK-NEXT: store <4 x float> [[PREDPHI12]], ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META11]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP13]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index b056f44a6c469..81745b7b38289 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -31,13 +31,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP0]], 2
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw i64 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP10]], i64 32000)
-; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i64 [[TMP10]], 32000
-; CHECK-NEXT: [[UMIN:%.*]] = zext i1 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP10]], [[UMIN]]
-; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[SMAX]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[UMIN]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[UMIN8]]
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP0]]
; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 2
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 4
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
index be57809132d3f..59d3576c749f6 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
@@ -141,11 +141,9 @@ define void @addsubs(ptr noalias noundef %x, ptr noundef %y, i32 noundef %n) {
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 2
@@ -267,12 +265,10 @@ define void @add2sub2(ptr noalias noundef %x, ptr noundef %y, i32 noundef %n) {
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP3]], <32 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -393,9 +389,9 @@ define void @addmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[Z:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC31:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[TMP4]], [[WIDE_VEC36]]
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -553,14 +549,9 @@ define void @addsubsmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z,
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <32 x i16> [[TMP7]], [[WIDE_VEC36]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP11:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP2]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -718,14 +709,9 @@ define void @add2sub2mul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z,
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i16> [[TMP5]], [[WIDE_VEC36]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT: [[TMP11:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT: [[TMP5:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP5]], <32 x i16> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
index dd0dee422058c..b8802b2cd1c64 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
@@ -11,36 +11,33 @@ define dso_local i32 @check(ptr noundef readonly captures(none) %mask, ptr nound
; CHECK-NEXT: br i1 [[CMP13]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[MASK]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[RESULT]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP15:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD3]], splat (float 2.000000e+00)
-; CHECK-NEXT: [[NOT_:%.*]] = xor <vscale x 4 x i1> [[TMP4]], splat (i1 true)
-; CHECK-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[NOT_]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]]
-; CHECK-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[NARROW8:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI5:%.*]] = zext <vscale x 4 x i1> [[NARROW8]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[BIN_RDX]] = add <vscale x 4 x i32> [[VEC_PHI]], [[PREDPHI5]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[FOR_BODY_PREHEADER]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[MASK]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP2]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[RESULT]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], splat (float 2.000000e+00)
+; CHECK-NEXT: [[NARROW4:%.*]] = select <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI2:%.*]] = zext <vscale x 4 x i1> [[NARROW4]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP7]] = add <vscale x 4 x i32> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT: br i1 [[TMP10]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP17:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
; CHECK: [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[COUNT_0_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 9032c363eb936..693d13a78805a 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -21,19 +21,15 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[N_VEC]], 1
; CHECK-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP0]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[OFFSET_IDX13:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX13]]
-; CHECK-NEXT: [[OFFSET_IDX15:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[OFFSET_IDX15]]
+; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i16>, ptr [[NEXT_GEP16]], align 2