[clang] [llvm] [LLVM][PhaseOrdering] Run CSE after InstCombine has cleaned the result of vectorisation. (PR #120443)

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 18 07:55:11 PST 2024


https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/120443

I revisited 58690 and was puzzled why the obvious combines do not fire. I believe the reason is combines that end with replaceOperand: they can introduce duplicate expression trees (albeit tiny ones), which in turn block combines that compare expression pointers.

Perhaps adding the extra CSE run is too big a hammer, but EarlyCSE already exists as the first of the extra-vectorisation passes for a similar reason, so promoting it to the main vectorisation pipeline seems reasonable, especially now that these issues are more prevalent as vectorisation capabilities increase.
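
To make the failure mode concrete, below is a small standalone C++ model. It is not LLVM or InstCombine code: Expr, New and CSE are invented stand-ins for instructions, instruction creation, and EarlyCSE's value numbering. The point it demonstrates is that a replaceOperand-style rewrite can leave two structurally identical expressions as distinct objects, so a combine guarded by pointer equality cannot fire until a CSE step merges the duplicates.

// Standalone model (not InstCombine code) of a pointer-equality combine
// being blocked by duplicate expression trees until CSE merges them.
#include <cstdio>
#include <map>
#include <memory>
#include <tuple>
#include <vector>

struct Expr {
  char Op;          // e.g. '&' for a boolean AND
  Expr *LHS, *RHS;  // operands; null for leaves
};

int main() {
  std::vector<std::unique_ptr<Expr>> Pool;
  auto New = [&](char Op, Expr *L, Expr *R) {
    Pool.push_back(std::make_unique<Expr>(Expr{Op, L, R}));
    return Pool.back().get();
  };

  Expr *Mask = New('m', nullptr, nullptr); // leaf: loop mask
  Expr *Cmp = New('c', nullptr, nullptr);  // leaf: compare result

  // A replaceOperand-style rewrite rebuilt the same AND twice.
  Expr *A = New('&', Mask, Cmp);
  Expr *B = New('&', Mask, Cmp);

  // A combine that asks "is this the same expression?" via pointer
  // identity does not fire: A != B despite computing the same value.
  std::printf("pointer equal before CSE: %d\n", A == B); // prints 0

  // CSE keyed on (opcode, operands) merges the duplicates, after which
  // the pointer-identity check succeeds and the combine can fire.
  std::map<std::tuple<char, Expr *, Expr *>, Expr *> VN;
  auto CSE = [&](Expr *E) {
    return VN.try_emplace(std::make_tuple(E->Op, E->LHS, E->RHS), E)
        .first->second;
  };
  std::printf("pointer equal after CSE: %d\n", CSE(A) == CSE(B)); // 1
}

In the real pipeline the same effect shows up as the duplicated select/add/mul instructions removed in the test diffs below once EarlyCSE runs after the post-vectorisation InstCombine.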

From 35989b3a5b78f1eacc13104d9fdb806eb92ac7ae Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 18 Dec 2024 13:17:13 +0000
Subject: [PATCH 1/2] Add test to show missing CSE.

---
 .../post-vectorisation-combines-with-cse.ll   | 92 +++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
new file mode 100644
index 00000000000000..fbef1f3f6e7380
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='default<O2>' --prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local i32 @check(ptr nocapture noundef readonly %mask, ptr nocapture noundef readonly %result, i32 noundef %n) #0 {
+; CHECK-LABEL: define dso_local i32 @check(
+; CHECK-SAME: ptr nocapture noundef readonly [[MASK:%.*]], ptr nocapture noundef readonly [[RESULT:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP13]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[MASK]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, ptr [[RESULT]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP15:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD3]], splat (float 2.000000e+00)
+; CHECK-NEXT:    [[NOT_:%.*]] = xor <vscale x 4 x i1> [[TMP4]], splat (i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = select <vscale x 4 x i1> [[NOT_]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[NARROW8:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI5:%.*]] = zext <vscale x 4 x i1> [[NARROW8]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[BIN_RDX]] = add <vscale x 4 x i32> [[VEC_PHI]], [[PREDPHI5]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[COUNT_0_LCSSA]]
+;
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %count.1.lcssa = phi i32 [ %count.1, %for.inc ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %count.1.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %count.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %count.014 = phi i32 [ 0, %for.body.preheader ], [ %count.1, %for.inc ]
+  %arrayidx = getelementptr inbounds nuw i8, ptr %mask, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  br i1 %cmp1, label %land.lhs.true, label %for.inc
+
+land.lhs.true:                                    ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds nuw float, ptr %result, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx4, align 4
+  %cmp6 = fcmp une float %1, 2.000000e+00
+  br i1 %cmp6, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %land.lhs.true
+  %inc = add nsw i32 %count.014, 1
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %land.lhs.true, %if.then
+  %count.1 = phi i32 [ %inc, %if.then ], [ %count.014, %land.lhs.true ], [ %count.014, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+v9a,+sve2" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.

From 7d87ab14862814ce8a29a91487f509267d6e2422 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 18 Dec 2024 13:23:29 +0000
Subject: [PATCH 2/2] [LLVM][PhaseOrdering] Run CSE after InstCombine has
 cleaned the result of vectorisation.

I revisited 58690 and was puzzled why the obvious combines do not
fire. I believe the reason is combines that end with replaceOperand:
they can introduce duplicate expression trees (albeit tiny ones),
which in turn block combines that compare expression pointers.

Perhaps adding the extra CSE run is too big a hammer, but EarlyCSE
already exists as the first of the extra-vectorisation passes for a
similar reason, so promoting it to the main vectorisation pipeline
seems reasonable, especially now that these issues are more prevalent
as vectorisation capabilities increase.
---
 .../sme-intrinsics/acle_sme_ld1_vnum.c        | 48 +++++++------------
 .../sme-intrinsics/acle_sme_st1_vnum.c        | 48 +++++++------------
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  5 +-
 llvm/test/Other/new-pm-defaults.ll            |  1 +
 llvm/test/Other/new-pm-lto-defaults.ll        |  1 +
 .../Other/new-pm-thinlto-postlink-defaults.ll |  1 +
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |  1 +
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |  1 +
 llvm/test/Other/opt-pipeline-vector-passes.ll |  1 -
 .../LoopVectorize/X86/float-induction-x86.ll  | 10 +---
 ...ting-sinking-required-for-vectorization.ll |  3 +-
 .../AArch64/indvars-vectorization.ll          |  8 +---
 .../AArch64/interleavevectorization.ll        | 32 ++++---------
 .../post-vectorisation-combines-with-cse.ll   | 39 +++++++--------
 .../PhaseOrdering/ARM/arm_mult_q15.ll         | 12 ++---
 15 files changed, 76 insertions(+), 135 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
index fb86690f07f1d8..517f481a640927 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
@@ -15,8 +15,7 @@
 // CHECK-C-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-C-NEXT:    ret void
 //
@@ -29,8 +28,7 @@
 // CHECK-CXX-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -49,8 +47,7 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -64,8 +61,7 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -84,8 +80,7 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -99,8 +94,7 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -119,8 +113,7 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -134,8 +127,7 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -184,8 +176,7 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
 // CHECK-C-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-C-NEXT:    ret void
 //
@@ -198,8 +189,7 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
 // CHECK-CXX-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -218,8 +208,7 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -233,8 +222,7 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -253,8 +241,7 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -268,8 +255,7 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -288,8 +274,7 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -303,8 +288,7 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c
index dafc3d61a05f1b..5ed28e653ec59c 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c
@@ -15,8 +15,7 @@
 // CHECK-C-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-C-NEXT:    ret void
 //
@@ -29,8 +28,7 @@
 // CHECK-CXX-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -49,8 +47,7 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -64,8 +61,7 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -84,8 +80,7 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -99,8 +94,7 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -119,8 +113,7 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -134,8 +127,7 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -184,8 +176,7 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6
 // CHECK-C-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-C-NEXT:    ret void
 //
@@ -198,8 +189,7 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6
 // CHECK-CXX-NEXT:    [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 15
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -218,8 +208,7 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -233,8 +222,7 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 7
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -253,8 +241,7 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -268,8 +255,7 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 3
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
@@ -288,8 +274,7 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-C-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-C-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-C-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-C-NEXT:    ret void
 //
@@ -303,8 +288,7 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
 // CHECK-CXX-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT:    [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
 // CHECK-CXX-NEXT:    ret void
 //
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index d737ea5ab070a9..7cdb6f529a8116 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1306,6 +1306,10 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // Cleanup after the loop optimization passes.
   FPM.addPass(InstCombinePass());
+  // InstCombine can create CSE opportunities when it cleans the result of loop
+  // vectorization. They occur when combines use replaceOperand, which happens
+  // most often when combining the boolean operations created by if-conversion.
+  FPM.addPass(EarlyCSEPass());
 
   if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
     ExtraFunctionPassManager<ShouldRunExtraVectorPasses> ExtraPasses;
@@ -1315,7 +1319,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
     // common computations, hoist loop-invariant aspects out of any outer loop,
     // and unswitch the runtime checks if possible. Once hoisted, we may have
     // dead (or speculatable) control flows or more combining opportunities.
-    ExtraPasses.addPass(EarlyCSEPass());
     ExtraPasses.addPass(CorrelatedValuePropagationPass());
     ExtraPasses.addPass(InstCombinePass());
     LoopPassManager LPM;
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 7cf035b0c6f376..8fe823dbcac220 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -259,6 +259,7 @@
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index f788db1e338a1e..e6f05cdada3f6a 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -123,6 +123,7 @@
 ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: SROAPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
+; CHECK-O23SZ-NEXT: Running pass: EarlyCSEPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: SimplifyCFGPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: SCCPPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index ed13402e1c4b15..268c3f46a5b5b0 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -181,6 +181,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index c82c34f7ff01e7..fb0601caa13080 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -166,6 +166,7 @@
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index d375747547d61f..d11d8d9ded01dc 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -175,6 +175,7 @@
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
diff --git a/llvm/test/Other/opt-pipeline-vector-passes.ll b/llvm/test/Other/opt-pipeline-vector-passes.ll
index 83a9454c0d8087..4d277c31d9445f 100644
--- a/llvm/test/Other/opt-pipeline-vector-passes.ll
+++ b/llvm/test/Other/opt-pipeline-vector-passes.ll
@@ -19,7 +19,6 @@
 
 ; Everything runs at -O2.
 ; O2-LABEL:  Running pass: LoopVectorizePass
-; O2-NOT:    Running pass: EarlyCSEPass
 ; O2-NOT:    Running pass: LICMPass
 ; O2:        Running pass: SLPVectorizerPass
 ; O2:        Running pass: VectorCombinePass
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index ab0b45473a623a..5098e58dbe67be 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -54,9 +54,6 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]]
 ; AUTO_VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; AUTO_VEC:       vec.epilog.iter.check:
-; AUTO_VEC-NEXT:    [[DOTCAST7:%.*]] = uitofp nneg i64 [[N_VEC]] to float
-; AUTO_VEC-NEXT:    [[TMP6:%.*]] = fmul fast float [[DOTCAST7]], 5.000000e-01
-; AUTO_VEC-NEXT:    [[IND_END8:%.*]] = fadd fast float [[TMP6]], 1.000000e+00
 ; AUTO_VEC-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[ZEXT]], 28
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
@@ -85,7 +82,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    br i1 [[CMP_N14]], label [[FOR_END]], label [[FOR_BODY]]
 ; AUTO_VEC:       for.body:
 ; AUTO_VEC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AUTO_VEC-NEXT:    [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; AUTO_VEC-NEXT:    [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; AUTO_VEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]]
 ; AUTO_VEC-NEXT:    store float [[X_06]], ptr [[ARRAYIDX]], align 4
 ; AUTO_VEC-NEXT:    [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01
@@ -434,9 +431,6 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
 ; AUTO_VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; AUTO_VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; AUTO_VEC:       vec.epilog.iter.check:
-; AUTO_VEC-NEXT:    [[DOTCAST10:%.*]] = uitofp nneg i64 [[N_VEC]] to float
-; AUTO_VEC-NEXT:    [[TMP11:%.*]] = fmul reassoc float [[DOTCAST10]], 4.200000e+01
-; AUTO_VEC-NEXT:    [[IND_END11:%.*]] = fadd reassoc float [[TMP11]], 1.000000e+00
 ; AUTO_VEC-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 28
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
@@ -469,7 +463,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
 ; AUTO_VEC-NEXT:    ret void
 ; AUTO_VEC:       for.body:
 ; AUTO_VEC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AUTO_VEC-NEXT:    [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; AUTO_VEC-NEXT:    [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; AUTO_VEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[P]], i64 [[INDVARS_IV]]
 ; AUTO_VEC-NEXT:    [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; AUTO_VEC-NEXT:    [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP16]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 82b1cf90b47207..3b84be17646981 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -168,9 +168,8 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD11]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 16
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope [[META9]], !noalias [[META11]]
-; CHECK-NEXT:    store <4 x float> [[PREDPHI12]], ptr [[TMP12]], align 4, !alias.scope [[META9]], !noalias [[META11]]
+; CHECK-NEXT:    store <4 x float> [[PREDPHI12]], ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META11]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index 2f61c89241fd46..b926ac47f5463e 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -31,13 +31,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw i64 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP10]], i64 32000)
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp slt i64 [[TMP10]], 32000
-; CHECK-NEXT:    [[UMIN:%.*]] = zext i1 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i64 [[TMP10]], [[UMIN]]
-; CHECK-NEXT:    [[TMP13:%.*]] = sub i64 [[SMAX]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], [[UMIN]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[UMIN8]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = shl i64 [[TMP15]], 2
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 4
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
index be57809132d3fa..59d3576c749f60 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
@@ -141,11 +141,9 @@ define void @addsubs(ptr noalias noundef %x, ptr noundef %y, i32 noundef %n) {
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 2
@@ -267,12 +265,10 @@ define void @add2sub2(ptr noalias noundef %x, ptr noundef %y, i32 noundef %n) {
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP3]], <32 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -393,9 +389,9 @@ define void @addmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[Z:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC31:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[TMP4]], [[WIDE_VEC36]]
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -553,14 +549,9 @@ define void @addsubsmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z,
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <32 x i16> [[TMP7]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP10]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -718,14 +709,9 @@ define void @add2sub2mul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z,
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i16> [[TMP5]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP5]], <32 x i16> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
index fbef1f3f6e7380..a792947b0e544a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/post-vectorisation-combines-with-cse.ll
@@ -11,36 +11,33 @@ define dso_local i32 @check(ptr nocapture noundef readonly %mask, ptr nocapture
 ; CHECK-NEXT:    br i1 [[CMP13]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
 ; CHECK:       [[FOR_BODY_PREHEADER]]:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[MASK]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, ptr [[RESULT]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP15:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD3]], splat (float 2.000000e+00)
-; CHECK-NEXT:    [[NOT_:%.*]] = xor <vscale x 4 x i1> [[TMP4]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = select <vscale x 4 x i1> [[NOT_]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]]
-; CHECK-NEXT:    [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[NARROW8:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI5:%.*]] = zext <vscale x 4 x i1> [[NARROW8]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[BIN_RDX]] = add <vscale x 4 x i32> [[VEC_PHI]], [[PREDPHI5]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[FOR_BODY_PREHEADER]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[MASK]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP2]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr float, ptr [[RESULT]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], splat (float 2.000000e+00)
+; CHECK-NEXT:    [[NARROW4:%.*]] = select <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI2:%.*]] = zext <vscale x 4 x i1> [[NARROW4]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP7]] = add <vscale x 4 x i32> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
 ; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
 ; CHECK:       [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[COUNT_0_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index ef651010d6a905..8e1324c4f6c1a0 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -21,19 +21,15 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[N_VEC]], 1
 ; CHECK-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP0]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[OFFSET_IDX13:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX13]]
-; CHECK-NEXT:    [[OFFSET_IDX15:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[OFFSET_IDX15]]
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
 ; CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x i16>, ptr [[NEXT_GEP16]], align 2


