[clang] [llvm] [LV] Mask off possibly aliasing vector lanes (PR #100579)

Sam Tebbs via cfe-commits cfe-commits at lists.llvm.org
Tue Aug 27 09:42:00 PDT 2024


https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100579

>From 88f20bcbddedece834e8cc203b8a3e003021a1e7 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 26 Jun 2024 09:55:45 +0100
Subject: [PATCH 01/11] [LV] Mask off possibly aliasing vector lanes

When vectorising a loop that uses loads and stores, the load and store
pointers could overlap if their difference is less than the vectorisation
factor. For example, if address 20 is being stored to and address 23 is
being loaded from, they overlap when the vectorisation factor is 4 or
higher. Currently LoopVectorize branches to a scalar loop in these cases,
guarded by a runtime check. However, if we construct a mask that disables
the overlapping (aliasing) lanes then the vectorised loop can safely be
entered, as long as the loads and stores are masked off.

This PR modifies the LoopVectorizer and VPlan to create such a mask and
always branch to the vector loop. Currently this is only done if we're
tail-predicating, but more work will come in the future to do this in
other cases as well.
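
As an illustration (not part of the patch), the mask is roughly equivalent
to the following scalar model. The function and parameter names are made
up for the example; the real code emits llvm.get.active.lane.mask in the
vector preheader, as in the tests below.

  #include <stdbool.h>
  #include <stdint.h>

  /* Rough model of the alias mask built in the vector preheader (names are
     illustrative only): lane i stays enabled when loading from
     load_addr + i*elt_size and storing to store_addr + i*elt_size cannot
     conflict within one vector iteration. */
  static void alias_mask(uintptr_t store_addr, uintptr_t load_addr,
                         intptr_t elt_size, unsigned vf, bool *mask) {
    /* Signed distance, in elements, from the store pointer to the load
       pointer, mirroring the %sub.diff/%diff computation in the tests. */
    intptr_t diff = ((intptr_t)load_addr - (intptr_t)store_addr) / elt_size;
    for (unsigned lane = 0; lane < vf; ++lane)
      /* get.active.lane.mask(0, diff) OR'd with a splat of (diff < 0). */
      mask[lane] = diff < 0 || (intptr_t)lane < diff;
  }

With the example above (store address 20, load address 23, byte elements)
diff is 3, so lanes 0-2 stay enabled and lane 3 is masked off at VF=4; the
canonical IV is then advanced by popcount(mask) = 3 instead of by VF.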
---
 clang/test/CodeGen/loop-alias-mask.c          | 404 ++++++++++++++++++
 .../llvm/Analysis/LoopAccessAnalysis.h        |  16 +
 .../Vectorize/LoopVectorizationPlanner.h      |  19 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 142 ++++--
 llvm/lib/Transforms/Vectorize/VPlan.h         |   2 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  91 +++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  31 +-
 .../Transforms/Vectorize/VPlanTransforms.h    |   5 +-
 .../LoopVectorize/AArch64/whilewr-opt.ll      | 369 ++++++++++++++++
 .../runtime-check-small-clamped-bounds.ll     |  22 +-
 .../runtime-checks-difference.ll              | 102 ++---
 11 files changed, 1097 insertions(+), 106 deletions(-)
 create mode 100644 clang/test/CodeGen/loop-alias-mask.c
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll

diff --git a/clang/test/CodeGen/loop-alias-mask.c b/clang/test/CodeGen/loop-alias-mask.c
new file mode 100644
index 00000000000000..76c3b5deddfa09
--- /dev/null
+++ b/clang/test/CodeGen/loop-alias-mask.c
@@ -0,0 +1,404 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang --target=aarch64-linux-gnu -march=armv9+sme2 -emit-llvm -S -g0 -O3 -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize %s -o - | FileCheck %s
+#include <stdint.h>
+
+// CHECK-LABEL: define dso_local void @alias_mask_8(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP3]]), !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_8(uint8_t *restrict a, uint8_t * b, uint8_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_16(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP4]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13:![0-9]+]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
+// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP6]], ptr [[TMP7]], i32 2, <vscale x 8 x i1> [[TMP3]]), !tbaa [[TBAA13]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_16(uint16_t *restrict a, uint16_t * b, uint16_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_32(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP4]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16:![0-9]+]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP3]]), !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_32(uint32_t *restrict a, uint32_t * b, uint32_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_64(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 2 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19:![0-9]+]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
+// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[TMP3]]), !tbaa [[TBAA19]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_64(uint64_t *restrict a, uint64_t * b, uint64_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_8(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[A15:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT:    [[B16:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
+// CHECK-NEXT:    [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], 0
+// CHECK-NEXT:    [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT22:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT21]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF18]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
+// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 16 x i1> [[TMP0]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP7]], ptr [[TMP8]], i32 1, <vscale x 16 x i1> [[TMP4]]), !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_multiple_8(uint8_t * a, uint8_t * b, uint8_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_16(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[A15:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT:    [[B16:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
+// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
+// CHECK-NEXT:    [[DIFF19:%.*]] = sdiv i64 [[SUB_DIFF18]], 2
+// CHECK-NEXT:    [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], -1
+// CHECK-NEXT:    [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT22:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT21]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF19]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
+// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 8 x i1> [[TMP0]] to <vscale x 8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP6]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
+// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP7]], ptr [[TMP8]], i32 2, <vscale x 8 x i1> [[TMP4]]), !tbaa [[TBAA13]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_multiple_16(uint16_t * a, uint16_t * b, uint16_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_32(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[A13:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT:    [[B14:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
+// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
+// CHECK-NEXT:    [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 4
+// CHECK-NEXT:    [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -3
+// CHECK-NEXT:    [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT20:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT19]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF17]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
+// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 4 x i1> [[TMP0]] to <vscale x 4 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP4]]), !tbaa [[TBAA16]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP24:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_multiple_32(uint32_t * a, uint32_t * b, uint32_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_64(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.body.preheader:
+// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT:    [[A13:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT:    [[B14:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
+// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
+// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
+// CHECK-NEXT:    [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 8
+// CHECK-NEXT:    [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -7
+// CHECK-NEXT:    [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT20:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT19]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF17]])
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
+// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 2 x i1> [[TMP0]] to <vscale x 2 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+// CHECK:       vector.body:
+// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT:    [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
+// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[TMP4]]), !tbaa [[TBAA19]]
+// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP25:![0-9]+]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    ret void
+//
+void alias_mask_multiple_64(uint64_t * a, uint64_t * b, uint64_t * c, int n) {
+  #pragma clang loop vectorize(enable)
+  for (int i = 0; i < n; i++) {
+    c[i] = a[i] + b[i];
+  }
+}
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 73d9c26ed6b1b7..2cf3815123e667 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -16,7 +16,9 @@
 
 #include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <optional>
 #include <variant>
 
@@ -442,6 +444,20 @@ struct PointerDiffInfo {
         NeedsFreeze(NeedsFreeze) {}
 };
 
+/// A pair of pointers that could overlap across a loop iteration.
+struct PointerDiffInfoValues {
+  /// The pointer being read from
+  Value *Src;
+  /// The pointer being stored to
+  Value *Sink;
+
+  PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
+                        SCEVExpander Exp, Instruction *Loc)
+      : Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
+        Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
+  PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
+};
+
 /// Holds information about the memory runtime legality checks to verify
 /// that a group of pointers do not overlap.
 class RuntimePointerChecking {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index b5f87e458833d6..debf00c904e895 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -26,6 +26,7 @@
 
 #include "VPlan.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Support/InstructionCost.h"
 
 namespace llvm {
@@ -363,7 +364,8 @@ class LoopVectorizationPlanner {
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
-  void plan(ElementCount UserVF, unsigned UserIC);
+  void plan(ElementCount UserVF, unsigned UserIC,
+       SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
@@ -441,12 +443,23 @@ class LoopVectorizationPlanner {
   /// returned VPlan is valid for. If no VPlan can be built for the input range,
   /// set the largest included VF to the maximum VF for which no plan could be
   /// built.
-  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
+  /// \p RTChecks is a list of pointer pairs that should be checked for
+  /// aliasing, setting \p HasAliasMask to true if an alias mask is generated
+  /// and the vector loop should be entered even if the pointers alias across
+  /// a loop iteration.
+  VPlanPtr
+  tryToBuildVPlanWithVPRecipes(VFRange &Range,
+                               SmallVector<PointerDiffInfoValues> RTChecks,
+                               bool &HasAliasMask);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
-  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+  /// \p RTChecks contains a list of pointer pairs that an alias mask should
+  /// be generated for.
+  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
+                                SmallVector<PointerDiffInfoValues> RTChecks,
+                                bool &HasAliasMask);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
   // instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6850b502939f58..44c75fdc6fdd94 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1822,6 +1822,10 @@ class GeneratedRTChecks {
   Loop *OuterLoop = nullptr;
 
 public:
+  /// Set by VPlan when the vector loop should be entered even when runtime
+  /// checks determine that pointers alias within an iteration.
+  bool HasAliasMask = false;
+
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                     TargetTransformInfo *TTI, const DataLayout &DL,
                     bool AddBranchWeights)
@@ -1862,9 +1866,11 @@ class GeneratedRTChecks {
 
     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
     if (RtPtrChecking.Need) {
-      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
-      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
-                                 "vector.memcheck");
+      if (!MemCheckBlock) {
+        auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
+        MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
+                                   "vector.memcheck");
+      }
 
       auto DiffChecks = RtPtrChecking.getDiffChecks();
       if (DiffChecks) {
@@ -1922,6 +1928,10 @@ class GeneratedRTChecks {
     OuterLoop = L->getParentLoop();
   }
 
+  Value *expandCodeForMemCheck(const SCEV *Scev, Instruction *Loc) {
+    return MemCheckExp.expandCodeFor(Scev, Scev->getType(), Loc);
+  }
+
   InstructionCost getCost() {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
@@ -2096,11 +2106,18 @@ class GeneratedRTChecks {
     if (OuterLoop)
       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
 
-    BranchInst &BI =
-        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
-    if (AddBranchWeights) {
+    // TODO: Branch to the vector preheader conditionally based on the number of
+    // non-aliasing elements. The scalar loop will likely be better if only one
+    // or two elements will be processed per vectorised loop iteration.
+
+    // Jump to the vector preheader unconditionally if it's safe to do so
+    // because an alias mask has been set up.
+    BranchInst &BI = HasAliasMask
+                         ? *BranchInst::Create(LoopVectorPreHeader)
+                         : *BranchInst::Create(Bypass, LoopVectorPreHeader,
+                                               MemRuntimeCheckCond);
+    if (!HasAliasMask && AddBranchWeights)
       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
-    }
     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
     MemCheckBlock->getTerminator()->setDebugLoc(
         Pred->getTerminator()->getDebugLoc());
@@ -2569,7 +2586,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-  LoopBypassBlocks.push_back(MemCheckBlock);
+  // If an alias mask has been set up then we don't need the bypass, as the
+  // vector preheader will be branched to unconditionally.
+  if (!RTChecks.HasAliasMask)
+    LoopBypassBlocks.push_back(MemCheckBlock);
 
   AddedSafetyChecks = true;
 
@@ -6985,7 +7005,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
+                               SmallVector<PointerDiffInfoValues> RTChecks,
+                               bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -7026,7 +7048,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF);
+        buildVPlansWithVPRecipes(UserVF, UserVF, RTChecks, HasAliasMask);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       } else
@@ -7055,8 +7077,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.collectInstsToScalarize(VF);
   }
 
-  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
-  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
+                           RTChecks, HasAliasMask);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
+                           RTChecks, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -7552,7 +7576,6 @@ LoopVectorizationPlanner::executePlan(
                              CanonicalIVStartValue, State);
 
   BestVPlan.execute(&State);
-
   // 2.5 Collect reduction resume values.
   DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
   auto *ExitVPBB =
@@ -7786,7 +7809,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
   // reduction phis in the scalar loop preheader.
   if (EPI.SCEVSafetyCheck)
     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
-  if (EPI.MemSafetyCheck)
+  if (EPI.MemSafetyCheck && !RTChecks.HasAliasMask)
     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
 
@@ -8035,6 +8058,7 @@ void VPRecipeBuilder::createHeaderMask() {
   // constructing the desired canonical IV in the header block as its first
   // non-phi instructions.
 
+  VPValue *BlockMask = nullptr;
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
@@ -8042,7 +8066,6 @@ void VPRecipeBuilder::createHeaderMask() {
 
   VPBuilder::InsertPointGuard Guard(Builder);
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-  VPValue *BlockMask = nullptr;
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
   BlockMaskCache[Header] = BlockMask;
@@ -8534,14 +8557,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
-                                                        ElementCount MaxVF) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
+    ElementCount MinVF, ElementCount MaxVF,
+    SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
+    if (auto Plan =
+            tryToBuildVPlanWithVPRecipes(SubRange, RTChecks, HasAliasMask)) {
       // Now optimize the initial VPlan.
       if (!Plan->hasVF(ElementCount::getFixed(1)))
         VPlanTransforms::truncateToMinimalBitwidths(
@@ -8562,7 +8587,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
 // Add the necessary canonical IV and branch recipes required to control the
 // loop.
 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
-                                  DebugLoc DL) {
+                                  DebugLoc DL, VPValue *AliasMask) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
 
@@ -8573,9 +8598,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
   Header->insert(CanonicalIVPHI, Header->begin());
 
   VPBuilder Builder(TopRegion->getExitingBasicBlock());
-  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+  // Add a VPInstruction to increment the scalar canonical IV by VF * UF, or
+  // by the popcount of the alias mask if there is one.
+  VPValue *IncrementBy = &Plan.getVFxUF();
+  if (AliasMask) {
+    IncrementBy = Builder.createNaryOp(VPInstruction::PopCount, {AliasMask}, DL,
+                                       "popcount");
+    auto *IVType = CanonicalIVPHI->getScalarType();
+
+    if (IVType->getScalarSizeInBits() < 64) {
+      auto *Cast =
+          new VPScalarCastRecipe(Instruction::Trunc, IncrementBy, IVType);
+      Cast->insertAfter(IncrementBy->getDefiningRecipe());
+      IncrementBy = Cast;
+    }
+  }
+
   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
-      Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
+      Instruction::Add, {CanonicalIVPHI, IncrementBy}, {HasNUW, false}, DL,
       "index.next");
   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
 
@@ -8813,8 +8853,9 @@ static void addLiveOutsForFirstOrderRecurrences(
   }
 }
 
-VPlanPtr
-LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
+    VFRange &Range, SmallVector<PointerDiffInfoValues> RTChecks,
+    bool &HasAliasMask) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
@@ -8853,7 +8894,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // When not folding the tail, we know that the induction increment will not
   // overflow.
   bool HasNUW = Style == TailFoldingStyle::None;
-  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+
+  VPValue *AliasMask = nullptr;
+  if (useActiveLaneMask(Style)) {
+    // Create an alias mask for each possibly-aliasing pointer pair. If there
+    // are multiple, they are combined with ANDs.
+    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
+    auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
+    VPBuilder Builder(VecPreheader);
+    for (auto C : RTChecks) {
+      HasAliasMask = true;
+      VPValue *Sink = Plan->getOrAddLiveIn(C.Sink);
+      VPValue *Src = Plan->getOrAddLiveIn(C.Src);
+      VPValue *M =
+          Builder.createNaryOp(VPInstruction::AliasLaneMask, {Sink, Src}, DL,
+                               "active.lane.mask.alias");
+      if (AliasMask)
+        AliasMask = Builder.createAnd(AliasMask, M);
+      else
+        AliasMask = M;
+    }
+  }
+  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL,
+                        AliasMask);
 
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
 
@@ -9067,7 +9130,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     bool WithoutRuntimeCheck =
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
-                                       WithoutRuntimeCheck);
+                                       WithoutRuntimeCheck, AliasMask);
   }
   return Plan;
 }
@@ -9107,7 +9170,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   // is guaranteed to not wrap.
   bool HasNUW = true;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
-                        DebugLoc());
+                        DebugLoc(), nullptr);
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
 }
@@ -9610,6 +9673,7 @@ static bool processLoopInVPlanNativePath(
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+
   return true;
 }
 
@@ -9932,18 +9996,34 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
+  bool AddBranchWeights =
+      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
+  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
+                           F->getDataLayout(), AddBranchWeights);
+
+  // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
+  // here and put them into a list.
+  std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
+      LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks();
+  SmallVector<PointerDiffInfoValues> DiffChecksValues;
+  if (DiffChecks.has_value() &&
+      useActiveLaneMask(CM.getTailFoldingStyle(true))) {
+    Instruction *Loc = L->getLoopPreheader()->getTerminator();
+    for (auto Check : *DiffChecks) {
+      Value *Sink = Checks.expandCodeForMemCheck(Check.SinkStart, Loc);
+      Value *Src = Checks.expandCodeForMemCheck(Check.SrcStart, Loc);
+      DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
+    }
+  }
+
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC);
+  LVP.plan(UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
   unsigned IC = 1;
 
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  bool AddBranchWeights =
-      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
-  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                           F->getDataLayout(), AddBranchWeights);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 36a1aa08654d5b..ecdd8a0283cbfd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1243,6 +1243,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
     SLPLoad,
     SLPStore,
     ActiveLaneMask,
+    AliasLaneMask,
     ExplicitVectorLength,
     /// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
     /// The first operand is the incoming value from the predecessor in VPlan,
@@ -1262,6 +1263,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
     // scalar.
     ExtractFromEnd,
     LogicalAnd, // Non-poison propagating logical And.
+    PopCount,
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 53b28a692059f6..6606383df6e2ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -18,9 +18,11 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
@@ -357,6 +359,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::PopCount:
     return true;
   default:
     return false;
@@ -422,6 +425,85 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
+  case VPInstruction::AliasLaneMask: {
+    // Given a pointer A that is being stored to and a pointer B that is being
+    // read from, both with unknown lengths, create a mask that disables
+    // elements which could overlap across a loop iteration. For example, if A
+    // is X and B is X + 2 with a VF of 4, only the final two elements of the
+    // loaded vector can be stored, since they don't overlap with the stored
+    // vector:
+    //   %b.vec = load %b      ; = [s, t, u, v]
+    //   [...]
+    //   store %a, %b.vec      ; only u and v can be stored, as their
+    //                         ; addresses don't overlap with %a + (VF - 1)
+    Value *ReadPtr = State.get(getOperand(0), VPIteration(Part, 0));
+    Value *StorePtr = State.get(getOperand(1), VPIteration(Part, 0));
+    unsigned ElementSize = 0;
+
+    // We expect the operands of the alias mask to be ptrtoints; sometimes an
+    // operand is an add of a ptrtoint.
+    auto *ReadInsn = cast<Instruction>(ReadPtr);
+    auto *ReadCast = dyn_cast<CastInst>(ReadPtr);
+    if (ReadInsn->getOpcode() == Instruction::Add)
+      ReadCast = dyn_cast<CastInst>(ReadInsn->getOperand(0));
+
+    if (ReadCast && ReadCast->getOpcode() == Instruction::PtrToInt) {
+      Value *Ptr = ReadCast->getOperand(0);
+      for (auto *Use : Ptr->users()) {
+        if (auto *GEP = dyn_cast<GetElementPtrInst>(Use)) {
+          auto *EltVT = GEP->getSourceElementType();
+          if (EltVT->isArrayTy())
+            ElementSize = EltVT->getArrayElementType()->getScalarSizeInBits() *
+                          EltVT->getArrayNumElements();
+          else
+            ElementSize =
+                GEP->getSourceElementType()->getScalarSizeInBits() / 8;
+          break;
+        }
+      }
+    }
+    assert(ElementSize > 0 && "Couldn't get element size from pointer");
+    // Calculate how many elements the pointers differ by
+    Value *Diff = Builder.CreateSub(StorePtr, ReadPtr, "sub.diff");
+    auto *Type = Diff->getType();
+    Value *MemEltSize = ConstantInt::get(Type, ElementSize);
+    Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
+    // If the difference is negative then no lanes need to be masked off and
+    // the whole mask will be enabled below
+    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLT, DiffDiv,
+                                    ConstantInt::get(Type, 0), "neg.compare");
+    // Splat the compare result then OR it with a lane mask
+    Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
+    Value *DiffMask = Builder.CreateIntrinsic(
+        Intrinsic::get_active_lane_mask,
+        {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+        {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+    return Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat,
+                               Name);
+  }
+  // Count the number of bits set in each lane and reduce the result to a scalar
+  case VPInstruction::PopCount: {
+    if (Part != 0)
+      return State.get(this, 0, /*IsScalar*/ true);
+    Value *Op = State.get(getOperand(0), Part);
+    auto *VT = Op->getType();
+    Value *Cnt = Op;
+
+    // i1 vectors can just use the add reduction. Bigger elements need a ctpop
+    // first.
+    if (VT->getScalarSizeInBits() > 1)
+      Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});
+
+    auto *VecVT = cast<VectorType>(VT);
+    // Extend to i8 since i1 is too narrow for the add reduction
+    if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
+      Cnt = Builder.CreateCast(
+          Instruction::ZExt, Cnt,
+          VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
+    }
+
+    Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
+    Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
+    return Cnt;
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     // Generate code to combine the previous and current values in vector v3.
     //
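The two opcodes added above are easier to follow with a scalar model. The
sketch below mirrors the IR emitted in the whilewr_* tests later in this
patch; the read-minus-write sign convention, the handling of the signed
division's rounding, and every name in it are illustrative assumptions rather
than code from this change.

// alias_mask_model.cpp -- scalar model of AliasLaneMask and PopCount.
#include <cstdint>
#include <cstdio>

// A lane is enabled when either the whole difference is negative (nothing to
// mask off) or the lane index is below the element distance between the read
// and write pointers.
static bool aliasLaneActive(int64_t ReadAddr, int64_t WriteAddr,
                            int64_t EltSize, int64_t Lane) {
  int64_t ByteDiff = ReadAddr - WriteAddr;
  return ByteDiff < 0 || Lane < ByteDiff / EltSize;
}

// The induction step is the popcount of the alias mask: the index only
// advances by the number of lanes that were actually processed.
static int64_t aliasMaskPopCount(int64_t ReadAddr, int64_t WriteAddr,
                                 int64_t EltSize, int64_t VF) {
  int64_t Count = 0;
  for (int64_t Lane = 0; Lane < VF; ++Lane)
    Count += aliasLaneActive(ReadAddr, WriteAddr, EltSize, Lane);
  return Count;
}

int main() {
  // Byte elements, pointers 3 bytes apart, VF = 4: lanes 0..2 are enabled
  // and the index advances by 3 instead of 4.
  std::printf("step = %lld\n",
              (long long)aliasMaskPopCount(/*ReadAddr=*/103, /*WriteAddr=*/100,
                                           /*EltSize=*/1, /*VF=*/4));
  return 0;
}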
@@ -675,7 +757,8 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == PopCount;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -812,6 +895,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ResumePhi:
     O << "resume-phi";
     break;
+  case VPInstruction::AliasLaneMask:
+    O << "alias lane mask";
+    break;
+  case VPInstruction::PopCount:
+    O << "popcount";
+    break;
   case VPInstruction::ExplicitVectorLength:
     O << "EXPLICIT-VECTOR-LENGTH";
     break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8deded031dc391..adf69f2c4e217e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1194,8 +1194,10 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
 //   %Negated = Not %ALM
 //   branch-on-cond %Negated
 //
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
-    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPValue *
+addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan,
+                                    bool DataAndControlFlowWithoutRuntimeCheck,
+                                    VPValue *AliasMask) {
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1236,14 +1238,22 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
+  VPValue *Mask =
       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                            DL, "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
-  auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(Mask, DebugLoc());
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
+  VPValue *LaneMask = LaneMaskPhi;
+  if (AliasMask) {
+    // AND in the alias mask so only non-aliasing lanes are processed
+    Builder.setInsertPoint(CanonicalIVPHI->getParent(),
+                           CanonicalIVPHI->getParent()->getFirstNonPhi());
+    LaneMask = Builder.createNaryOp(Instruction::BinaryOps::And,
+                                    {LaneMaskPhi, AliasMask}, DL);
+  }
 
   // Create the active lane mask for the next iteration of the loop before the
   // original terminator.
@@ -1262,7 +1272,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   auto *NotMask = Builder.createNot(ALM, DL);
   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
   OriginalTerminator->eraseFromParent();
-  return LaneMaskPhi;
+  return LaneMask;
 }
 
 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
@@ -1312,23 +1322,24 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
 
 void VPlanTransforms::addActiveLaneMask(
     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
-    bool DataAndControlFlowWithoutRuntimeCheck) {
+    bool DataAndControlFlowWithoutRuntimeCheck, VPValue *AliasMask) {
   assert((!DataAndControlFlowWithoutRuntimeCheck ||
           UseActiveLaneMaskForControlFlow) &&
          "DataAndControlFlowWithoutRuntimeCheck implies "
          "UseActiveLaneMaskForControlFlow");
 
-  auto FoundWidenCanonicalIVUser =
+  auto *FoundWidenCanonicalIVUser =
       find_if(Plan.getCanonicalIV()->users(),
               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
-  assert(FoundWidenCanonicalIVUser &&
+  assert(FoundWidenCanonicalIVUser && *FoundWidenCanonicalIVUser &&
          "Must have widened canonical IV when tail folding!");
   auto *WideCanonicalIV =
       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
-  VPSingleDefRecipe *LaneMask;
+  VPValue *LaneMask;
   if (UseActiveLaneMaskForControlFlow) {
     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
-        Plan, DataAndControlFlowWithoutRuntimeCheck);
+        Plan, DataAndControlFlowWithoutRuntimeCheck, AliasMask);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
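To keep the control-flow change above concrete: the loop exit is still driven
by the plain active lane mask, and only the loads and stores see the ANDed
mask. A minimal per-lane model of that split, with assumed names and an
assumed c[i] = a[i] + b[i] body (compare the tests further down), is:

// combined_mask_model.cpp -- how the two masks are used per lane.
#include <cstdint>
#include <vector>

// One predicated "vector" step of VF lanes starting at Index. The exit
// condition of the surrounding loop would test only ActiveLane; the memory
// operations additionally require the alias lane to be enabled.
static void predicatedStep(const std::vector<uint8_t> &A,
                           const std::vector<uint8_t> &B,
                           std::vector<uint8_t> &C, int64_t Index, int64_t VF,
                           int64_t TripCount,
                           const std::vector<bool> &AliasLanes) {
  for (int64_t Lane = 0; Lane < VF; ++Lane) {
    bool ActiveLane = Index + Lane < TripCount;    // active lane mask
    bool MemLane = ActiveLane && AliasLanes[Lane]; // mask for loads/stores
    if (MemLane)
      C[Index + Lane] = A[Index + Lane] + B[Index + Lane];
  }
}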
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c2..e82226ac813c7f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -73,9 +73,12 @@ struct VPlanTransforms {
   /// creation) and instead it is handled using active-lane-mask. \p
   /// DataAndControlFlowWithoutRuntimeCheck implies \p
   /// UseActiveLaneMaskForControlFlow.
+  /// \p AliasMask, if non-null, is ANDed with the active lane mask so that
+  /// any lanes whose pointers alias within a loop iteration are masked off.
   static void addActiveLaneMask(VPlan &Plan,
                                 bool UseActiveLaneMaskForControlFlow,
-                                bool DataAndControlFlowWithoutRuntimeCheck);
+                                bool DataAndControlFlowWithoutRuntimeCheck,
+                                VPValue *AliasMask);
 
   /// Insert truncates and extends for any truncated recipe. Redundant casts
   /// will be folded later.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll
new file mode 100644
index 00000000000000..b3fb78df060820
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll
@@ -0,0 +1,369 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=loop-vectorize -mtriple=aarch64-linux-gnu -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
+define dso_local void @whilewr_8(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_8(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %c14 = ptrtoint ptr %c to i64
+  %b15 = ptrtoint ptr %b to i64
+  %wide.trip.count = zext nneg i32 %n to i64
+  %sub.diff = sub i64 %b15, %c14
+  %neg.compare = icmp slt i64 %sub.diff, 0
+  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+  %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
+  %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
+  %2 = zext i8 %1 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
+  %4 = getelementptr inbounds i8, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+  %5 = getelementptr inbounds i8, ptr %b, i64 %index
+  %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+  %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
+  %7 = getelementptr inbounds i8, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
+  %index.next = add i64 %index, %2
+  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+  br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+define dso_local void @whilewr_16(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_16(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP4]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP6]], ptr [[TMP7]], i32 2, <vscale x 8 x i1> [[TMP3]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %c14 = ptrtoint ptr %c to i64
+  %b15 = ptrtoint ptr %b to i64
+  %wide.trip.count = zext nneg i32 %n to i64
+  %sub.diff = sub i64 %b15, %c14
+  %diff = sdiv i64 %sub.diff, 2
+  %neg.compare = icmp slt i64 %sub.diff, -1
+  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
+  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+  %0 = zext <vscale x 8 x i1> %active.lane.mask.alias to <vscale x 8 x i8>
+  %1 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %0)
+  %2 = zext i8 %1 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %3 = and <vscale x 8 x i1> %active.lane.mask, %active.lane.mask.alias
+  %4 = getelementptr inbounds i16, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %3, <vscale x 8 x i16> poison)
+  %5 = getelementptr inbounds i16, ptr %b, i64 %index
+  %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %3, <vscale x 8 x i16> poison)
+  %6 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
+  %7 = getelementptr inbounds i16, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %6, ptr %7, i32 2, <vscale x 8 x i1> %3)
+  %index.next = add i64 %index, %2
+  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %8 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+  br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+define dso_local void @whilewr_32(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_32(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP4]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %c12 = ptrtoint ptr %c to i64
+  %b13 = ptrtoint ptr %b to i64
+  %wide.trip.count = zext nneg i32 %n to i64
+  %sub.diff = sub i64 %b13, %c12
+  %diff = sdiv i64 %sub.diff, 4
+  %neg.compare = icmp slt i64 %sub.diff, -3
+  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+  %0 = zext <vscale x 4 x i1> %active.lane.mask.alias to <vscale x 4 x i8>
+  %1 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %0)
+  %2 = zext i8 %1 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %3 = and <vscale x 4 x i1> %active.lane.mask, %active.lane.mask.alias
+  %4 = getelementptr inbounds i32, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %3, <vscale x 4 x i32> poison)
+  %5 = getelementptr inbounds i32, ptr %b, i64 %index
+  %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %3, <vscale x 4 x i32> poison)
+  %6 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
+  %7 = getelementptr inbounds i32, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %6, ptr %7, i32 4, <vscale x 4 x i1> %3)
+  %index.next = add i64 %index, %2
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %8 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+define dso_local void @whilewr_64(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_64(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 2 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[TMP3]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %c12 = ptrtoint ptr %c to i64
+  %b13 = ptrtoint ptr %b to i64
+  %wide.trip.count = zext nneg i32 %n to i64
+  %sub.diff = sub i64 %b13, %c12
+  %diff = sdiv i64 %sub.diff, 8
+  %neg.compare = icmp slt i64 %sub.diff, -7
+  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+  %0 = zext <vscale x 2 x i1> %active.lane.mask.alias to <vscale x 2 x i8>
+  %1 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %0)
+  %2 = zext i8 %1 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %3 = and <vscale x 2 x i1> %active.lane.mask, %active.lane.mask.alias
+  %4 = getelementptr inbounds i64, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %3, <vscale x 2 x i64> poison)
+  %5 = getelementptr inbounds i64, ptr %b, i64 %index
+  %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %3, <vscale x 2 x i64> poison)
+  %6 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
+  %7 = getelementptr inbounds i64, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %6, ptr %7, i32 8, <vscale x 2 x i1> %3)
+  %index.next = add i64 %index, %2
+  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %8 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+  br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) #1
+
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) #2
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>) #4
+
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
+
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>) #2
+
+declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8>) #4
+
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) #1
+
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #2
+
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8>) #4
+
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1
+
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) #2
+
+declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8>) #4
+
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
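The four whilewr_* functions above all vectorize the same scalar shape. A
source loop along the following lines is assumed here for illustration (it is
not taken from this patch); b and c are the pair whose difference feeds the
alias mask, while a is annotated noalias in the IR:

// whilewr_like.cpp -- assumed source shape behind the whilewr_* tests.
#include <cstdint>

void whilewr_8_like(const uint8_t *a, const uint8_t *b, uint8_t *c, int n) {
  // b and c may overlap, so only non-aliasing lanes may be processed per
  // vector iteration; a is assumed not to alias either of them.
  for (int i = 0; i < n; i++)
    c[i] = a[i] + b[i];
}

void whilewr_64_like(const int64_t *a, const int64_t *b, int64_t *c, int n) {
  // Same shape with 8-byte elements, matching whilewr_64 above.
  for (int i = 0; i < n; i++)
    c[i] = a[i] + b[i];
}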
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index e7e63e55802fe1..53ef470e098d0d 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -185,18 +185,18 @@ define void @load_clamped_index_offset_1(ptr %A, ptr %B, i32 %N) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = urem i32 [[TMP10]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = urem i32 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index 35ece2fe6eacd0..742788a6ba62ff 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -8,11 +8,11 @@ define void @same_step_and_size(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[B1]], [[A2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:
   br label %loop
@@ -38,11 +38,11 @@ define void @same_step_and_size_no_dominance_between_accesses(ptr %a, ptr %b, i6
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:
   br label %loop
@@ -75,16 +75,16 @@ define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n)
 ; CHECK-LABEL: @different_steps_and_different_access_sizes(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    [[N_SHL_1:%.]] = shl i64 %n, 1
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr %a, i64 [[N_SHL_1]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP4]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:
   br label %loop
@@ -111,12 +111,12 @@ define void @steps_match_but_different_access_sizes_1(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[B1]], -2
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[A2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:
   br label %loop
@@ -145,12 +145,12 @@ define void @steps_match_but_different_access_sizes_2(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[A1]], 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[B2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:
   br label %loop
@@ -176,25 +176,27 @@ exit:
 ; Test case for PR57315.
 define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1(
-; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    br label %outer
-
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
-
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
+
+
 entry:
   br label %outer.header
 
@@ -226,25 +228,27 @@ exit:
 ; sink and source swapped.
 define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner2(
-; CHECK:        entry:
-; CHECK-NEXT:    [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT:    [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT:    br label %outer
-
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
-; CHECK:         [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT:    [[A_GEP_UPPER:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT:    [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT:    [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK:         [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
-
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
+
+
 entry:
   br label %outer.header
 
@@ -280,15 +284,15 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
 ; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
 ; CHECK:       outer.loop:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N:%.*]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ;
 entry:

>From af68498b34554dd99bcf9a9787e79baa270b580c Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 5 Aug 2024 13:43:08 +0100
Subject: [PATCH 02/11] Move PointerDiffInfoValues to VPlan.h

---
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 14 --------------
 llvm/lib/Transforms/Vectorize/VPlan.h           | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 2cf3815123e667..fe378ca4371c36 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -444,20 +444,6 @@ struct PointerDiffInfo {
         NeedsFreeze(NeedsFreeze) {}
 };
 
-/// A pair of pointers that could overlap across a loop iteration.
-struct PointerDiffInfoValues {
-  /// The pointer being read from
-  Value *Src;
-  /// The pointer being stored to
-  Value *Sink;
-
-  PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
-                        SCEVExpander Exp, Instruction *Loc)
-      : Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
-        Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
-  PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
-};
-
 /// Holds information about the memory runtime legality checks to verify
 /// that a group of pointers do not overlap.
 class RuntimePointerChecking {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ecdd8a0283cbfd..60283a64524cc4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3843,6 +3843,20 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
 bool isHeaderMask(const VPValue *V, VPlan &Plan);
 } // end namespace vputils
 
+/// A pair of pointers that could overlap across a loop iteration.
+struct PointerDiffInfoValues {
+  /// The pointer being read from
+  Value *Src;
+  /// The pointer being stored to
+  Value *Sink;
+
+  PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
+                        SCEVExpander Exp, Instruction *Loc)
+      : Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
+        Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
+  PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
+};
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H

>From 1a95a4739e8b82b246affd17de10b35303e38668 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 6 Aug 2024 13:48:17 +0100
Subject: [PATCH 03/11] Move SCEV expansion so it's done after we know if we're
 tail-folding or not

---
 .../Vectorize/LoopVectorizationPlanner.h      |  6 ++++-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 27 ++++++++++++++-----
 .../AArch64/induction-costs-sve.ll            | 21 +++++++++++----
 .../LoopVectorize/ARM/scalar-block-cost.ll    |  4 +--
 4 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index debf00c904e895..85ba2de46a2fb1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -364,8 +364,12 @@ class LoopVectorizationPlanner {
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
+  /// \p DiffChecks is a list of pointer pairs that should be checked for
+  /// aliasing, and \p Expander expands their SCEV start values into IR
+  /// Values. \p HasAliasMask is set to true if an alias mask is generated and
+  /// the vector loop should be entered even if the pointers alias across a
+  /// loop iteration.
   void plan(ElementCount UserVF, unsigned UserIC,
-       SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask);
+            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+            std::function<Value *(const SCEV *)> Expander, bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 44c75fdc6fdd94..8c73edd71da82d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7006,8 +7006,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 }
 
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
-                               SmallVector<PointerDiffInfoValues> RTChecks,
-                               bool &HasAliasMask) {
+                               std::optional<ArrayRef<PointerDiffInfo>> RTChecks,
+                               std::function<Value *(const SCEV *)> Expander,
+                               bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -7016,6 +7016,18 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
+  // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
+  // here and put them into a list.
+  SmallVector<PointerDiffInfoValues> DiffChecksValues;
+  if (RTChecks.has_value() &&
+      useActiveLaneMask(CM.getTailFoldingStyle(true))) {
+    for (auto Check : *RTChecks) {
+      Value *Sink = Expander(Check.SinkStart);
+      Value *Src = Expander(Check.SrcStart);
+      DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
+    }
+  }
+
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
       !useMaskedInterleavedAccesses(TTI)) {
@@ -7048,7 +7060,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF, RTChecks, HasAliasMask);
+        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecksValues, HasAliasMask);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       } else
@@ -7078,9 +7090,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
   }
 
   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
-                           RTChecks, HasAliasMask);
+                           DiffChecksValues, HasAliasMask);
   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
-                           RTChecks, HasAliasMask);
+                           DiffChecksValues, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -10017,7 +10029,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   }
 
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask);
+  auto Expand = [&Checks, &L](const SCEV *S) {
+    return Checks.expandCodeForMemCheck(S, L->getLoopPreheader()->getTerminator());
+  };
+  LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand, Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
   unsigned IC = 1;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index edba5ee1d7f9eb..b8053bb31b58e3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -153,7 +153,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
 ; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
@@ -163,6 +163,13 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
+; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; PRED-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[DIFF]], 0
+; PRED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; PRED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -177,9 +184,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP30:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
 ; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP30]], <vscale x 8 x i8> poison)
 ; PRED-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
 ; PRED-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
 ; PRED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
@@ -188,8 +196,11 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP30]])
+; PRED-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; PRED-NEXT:    [[TMP32:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP31]])
+; PRED-NEXT:    [[TMP33:%.*]] = zext i8 [[TMP32]] to i64
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP33]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
@@ -197,7 +208,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
 ; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 596e42e9f094de..b381392ebf9f16 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -9,8 +9,8 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add = add nuw nsw i32 %i.09, 1
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %data, i32 %add
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i32, ptr %arrayidx, align 4
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add1 = add nsw i32 %0, 5
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i32, ptr %arrayidx, align 4
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add1 = add nsw i32 %1, 5
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx2 = getelementptr inbounds i32, ptr %dst, i32 %i.09
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %add1, ptr %arrayidx2, align 4
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %add, %n
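
The PRED check lines above show the shape of the alias mask this patch emits:
the pointer difference (SUB_DIFF) is divided by the element size, a lane is
enabled when its index is below that element distance, the whole mask is
forced on when the difference is sufficiently negative, and the induction
variable advances by the number of enabled lanes rather than by VF. A rough
scalar model of that arithmetic, purely for illustration (Dist, EltSize and
VF are placeholder names, not variables from the patch):

  #include <cstdint>
  #include <vector>

  // Models the mask built in the vector preheader: Dist corresponds to
  // SUB_DIFF (the byte difference between the two pointers), EltSize to the
  // access size in bytes and VF to the number of vector lanes.
  std::vector<bool> computeAliasLaneMask(int64_t Dist, int64_t EltSize,
                                         unsigned VF) {
    std::vector<bool> Mask(VF);
    int64_t EltDist = Dist / EltSize;        // DIFF = SUB_DIFF sdiv EltSize
    bool AllLanes = Dist < -(EltSize - 1);   // NEG_COMPARE, splatted and OR'd
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Mask[Lane] = AllLanes || (int64_t)Lane < EltDist;
    return Mask;
  }

  // The loop then steps by the number of enabled lanes, matching the
  // vector.reduce.add of the zero-extended alias mask in the check lines.
  unsigned stepForIteration(const std::vector<bool> &Mask) {
    unsigned Step = 0;
    for (bool Active : Mask)
      Step += Active;
    return Step;
  }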

>From 6447833a840be309224db8bc359862c76affdb21 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 10:29:17 +0100
Subject: [PATCH 04/11] Add statistic variable

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c73edd71da82d..33000cff03291f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -179,6 +179,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -10034,6 +10035,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   };
   LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand, Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
+  if (Checks.HasAliasMask)
+    LoopsAliasMasked++;
   unsigned IC = 1;
 
   if (ORE->allowExtraAnalysis(LV_NAME))
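
For reference, this uses LLVM's standard statistics facility: a counter
declared with the STATISTIC macro (which relies on the file's DEBUG_TYPE) and
incremented when the event occurs, so it appears in -stats output. A minimal,
self-contained sketch of the pattern, with the counter name mirroring the
patch but the surrounding code purely illustrative:

  #include "llvm/ADT/Statistic.h"

  #define DEBUG_TYPE "loop-vectorize"

  // Printed as part of -stats output when statistics are enabled.
  STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");

  static void noteAliasMaskedLoop(bool HasAliasMask) {
    if (HasAliasMask)
      ++LoopsAliasMasked;
  }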

>From 43205ead4984ab274b525841370872aa60f1f0f7 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 16:33:17 +0100
Subject: [PATCH 05/11] Remove SCEV expander header include move

---
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h           | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index fe378ca4371c36..5a596bd333ca26 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -17,8 +17,8 @@
 #include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <optional>
 #include <variant>
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 60283a64524cc4..b9a18db36e24ca 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -43,6 +43,7 @@
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/InstructionCost.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>

>From 0c2efafda44b88ef3b7d27fdb43909b282746c3e Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 16:39:58 +0100
Subject: [PATCH 06/11] Remove unnecessary changes from
 runtime-checks-difference.ll

---
 .../runtime-checks-difference.ll              | 52 +++++++++----------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index 742788a6ba62ff..6795f654cd7daf 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -8,11 +8,11 @@ define void @same_step_and_size(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[B1]], [[A2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
 ;
 entry:
   br label %loop
@@ -38,11 +38,11 @@ define void @same_step_and_size_no_dominance_between_accesses(ptr %a, ptr %b, i6
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
 ;
 entry:
   br label %loop
@@ -75,16 +75,16 @@ define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n)
 ; CHECK-LABEL: @different_steps_and_different_access_sizes(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N]], 2
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 1
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 %n, 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr %a, i64 [[TMP1]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
 entry:
   br label %loop
@@ -111,12 +111,12 @@ define void @steps_match_but_different_access_sizes_1(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[B1]], -2
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[A2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
 ;
 entry:
   br label %loop
@@ -145,12 +145,12 @@ define void @steps_match_but_different_access_sizes_2(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[A1]], 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[B2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
 ;
 entry:
   br label %loop
@@ -176,10 +176,11 @@ exit:
 ; Test case for PR57315.
 define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:        entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 %n, 2
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
+; CHECK-NEXT:    br label %outer
+
 ; CHECK:       outer.header:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
@@ -193,10 +194,8 @@ define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
-
-
 entry:
   br label %outer.header
 
@@ -240,15 +239,14 @@ define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64
 ; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
 ;
-
-
 entry:
   br label %outer.header
 
@@ -288,7 +286,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
 ; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
 ; CHECK:       outer.loop:
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:

>From 91318bb5e48b9078ea1832e575870019c638ac70 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 16:40:58 +0100
Subject: [PATCH 07/11] Remove clang test

---
 clang/test/CodeGen/loop-alias-mask.c | 404 ---------------------------
 1 file changed, 404 deletions(-)
 delete mode 100644 clang/test/CodeGen/loop-alias-mask.c

diff --git a/clang/test/CodeGen/loop-alias-mask.c b/clang/test/CodeGen/loop-alias-mask.c
deleted file mode 100644
index 76c3b5deddfa09..00000000000000
--- a/clang/test/CodeGen/loop-alias-mask.c
+++ /dev/null
@@ -1,404 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang --target=aarch64-linux-gnu -march=armv9+sme2 -emit-llvm -S -g0 -O3 -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize %s -o - | FileCheck %s
-#include <stdint.h>
-
-// CHECK-LABEL: define dso_local void @alias_mask_8(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6:![0-9]+]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP3]]), !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP9:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_8(uint8_t *restrict a, uint8_t * b, uint8_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_16(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
-// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP4]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13:![0-9]+]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
-// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP6]], ptr [[TMP7]], i32 2, <vscale x 8 x i1> [[TMP3]]), !tbaa [[TBAA13]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP15:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_16(uint16_t *restrict a, uint16_t * b, uint16_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_32(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
-// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 4 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP4]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16:![0-9]+]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
-// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP3]]), !tbaa [[TBAA16]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP18:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_32(uint32_t *restrict a, uint32_t * b, uint32_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_64(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
-// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 2 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19:![0-9]+]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
-// CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[TMP3]]), !tbaa [[TBAA19]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP21:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_64(uint64_t *restrict a, uint64_t * b, uint64_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_8(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[A15:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT:    [[B16:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
-// CHECK-NEXT:    [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], 0
-// CHECK-NEXT:    [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT22:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT21]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF18]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
-// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 16 x i1> [[TMP0]] to <vscale x 16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP7]], ptr [[TMP8]], i32 1, <vscale x 16 x i1> [[TMP4]]), !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP22:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_multiple_8(uint8_t * a, uint8_t * b, uint8_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_16(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[A15:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT:    [[B16:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
-// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
-// CHECK-NEXT:    [[DIFF19:%.*]] = sdiv i64 [[SUB_DIFF18]], 2
-// CHECK-NEXT:    [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], -1
-// CHECK-NEXT:    [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT22:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT21]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF19]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
-// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 8 x i1> [[TMP0]] to <vscale x 8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP6]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
-// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP7]], ptr [[TMP8]], i32 2, <vscale x 8 x i1> [[TMP4]]), !tbaa [[TBAA13]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP23:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_multiple_16(uint16_t * a, uint16_t * b, uint16_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_32(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[A13:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT:    [[B14:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
-// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
-// CHECK-NEXT:    [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 4
-// CHECK-NEXT:    [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -3
-// CHECK-NEXT:    [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT20:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT19]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF17]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
-// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 4 x i1> [[TMP0]] to <vscale x 4 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
-// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP4]]), !tbaa [[TBAA16]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP24:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_multiple_32(uint32_t * a, uint32_t * b, uint32_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_64(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.body.preheader:
-// CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT:    [[A13:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT:    [[B14:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
-// CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
-// CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT:    [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
-// CHECK-NEXT:    [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 8
-// CHECK-NEXT:    [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -7
-// CHECK-NEXT:    [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT20:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT19]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-// CHECK-NEXT:    [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF17]])
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
-// CHECK-NEXT:    [[TMP0:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP1:%.*]] = zext <vscale x 2 x i1> [[TMP0]] to <vscale x 2 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-// CHECK:       vector.body:
-// CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT:    [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
-// CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[TMP4]]), !tbaa [[TBAA19]]
-// CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP25:![0-9]+]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    ret void
-//
-void alias_mask_multiple_64(uint64_t * a, uint64_t * b, uint64_t * c, int n) {
-  #pragma clang loop vectorize(enable)
-  for (int i = 0; i < n; i++) {
-    c[i] = a[i] + b[i];
-  }
-}
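
The removed test above also covered loops with two potentially-aliasing
pointer pairs (the alias_mask_multiple_* functions), where each pair gets its
own lane mask and the masks are AND'd together before being combined with the
loop's active lane mask. Continuing the illustrative scalar model from
earlier, the combination step amounts to:

  #include <cstddef>
  #include <vector>

  // A lane is enabled in the combined mask only when it is enabled for every
  // runtime pointer-pair check; this mirrors the 'and' of the two
  // ACTIVE_LANE_MASK_ALIAS values in the deleted check lines above.
  std::vector<bool> combineAliasMasks(const std::vector<bool> &A,
                                      const std::vector<bool> &B) {
    std::vector<bool> Out(A.size());
    for (std::size_t I = 0, E = A.size(); I != E; ++I)
      Out[I] = A[I] && B[I];
    return Out;
  }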

>From 4c3da47db813e17055b54fd18b4405a4f0adde55 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 8 Aug 2024 16:17:50 +0100
Subject: [PATCH 08/11] Use VPExpandSCEVRecipe

---
 .../Vectorize/LoopVectorizationPlanner.h      | 11 ++--
 .../Transforms/Vectorize/LoopVectorize.cpp    | 58 +++++--------------
 .../AArch64/induction-costs-sve.ll            |  4 +-
 .../LoopVectorize/ARM/scalar-block-cost.ll    |  4 +-
 4 files changed, 25 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 85ba2de46a2fb1..8099b95de77e1b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -369,7 +369,7 @@ class LoopVectorizationPlanner {
   /// and the vector loop should be entered even if the pointers alias across a
   /// loop iteration.
   void plan(ElementCount UserVF, unsigned UserIC,
-       std::optional<ArrayRef<PointerDiffInfo>> DiffChecks, std::function<Value*(const SCEV *)> Expander, bool &HasAliasMask);
+       std::optional<ArrayRef<PointerDiffInfo>> DiffChecks, bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
@@ -451,10 +451,9 @@ class LoopVectorizationPlanner {
   /// setting HasAliasMask to true in the case that an alias mask is generated
   /// and the vector loop should be entered even if the pointers alias across a
   /// loop iteration.
-  VPlanPtr
-  tryToBuildVPlanWithVPRecipes(VFRange &Range,
-                               SmallVector<PointerDiffInfoValues> RTChecks,
-                               bool &HasAliasMask);
+  VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range,
+                                        ArrayRef<PointerDiffInfo> RTChecks,
+                                        bool &HasAliasMask);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
@@ -462,7 +461,7 @@ class LoopVectorizationPlanner {
   /// RTChecks contains a list of pointer pairs that an alias mask should be
   /// generated for.
   void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
-                                SmallVector<PointerDiffInfoValues> RTChecks,
+                                ArrayRef<PointerDiffInfo> RTChecks,
                                 bool &HasAliasMask);
 
   // Adjust the recipes for reductions. For in-loop reductions the chain of
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33000cff03291f..57009e9243e85f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1929,10 +1929,6 @@ class GeneratedRTChecks {
     OuterLoop = L->getParentLoop();
   }
 
-  Value *expandCodeForMemCheck(const SCEV *Scev, Instruction *Loc) {
-    return MemCheckExp.expandCodeFor(Scev, Scev->getType(), Loc);
-  }
-
   InstructionCost getCost() {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
@@ -7007,8 +7003,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 }
 
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
-                               std::optional<ArrayRef<PointerDiffInfo>> RTChecks,
-                               std::function<Value*(const SCEV*)> Expander, bool &HasAliasMask) {
+    std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -7019,15 +7014,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
 
   // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
   // here and put them into a list.
-  SmallVector<PointerDiffInfoValues> DiffChecksValues;
-  if (RTChecks.has_value()
-      && useActiveLaneMask(CM.getTailFoldingStyle(true))) {
-    for (auto Check : *RTChecks) {
-      Value *Sink = Expander(Check.SinkStart);
-      Value *Src = Expander(Check.SrcStart);
-      DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
-    }
-  }
+  ArrayRef<PointerDiffInfo> DiffChecks;
+  if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
+    DiffChecks = *RTChecks;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
@@ -7061,7 +7050,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
       CM.collectInLoopReductions();
       if (CM.selectUserVectorizationFactor(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecksValues, HasAliasMask);
+        buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
         LLVM_DEBUG(printPlans(dbgs()));
         return;
       } else
@@ -7091,9 +7080,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
   }
 
   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
-                           DiffChecksValues, HasAliasMask);
+                           DiffChecks, HasAliasMask);
   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
-                           DiffChecksValues, HasAliasMask);
+                           DiffChecks, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -8571,8 +8560,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
-    ElementCount MinVF, ElementCount MaxVF,
-    SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask) {
+    ElementCount MinVF, ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
+    bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
   auto MaxVFTimes2 = MaxVF * 2;
@@ -8867,8 +8856,7 @@ static void addLiveOutsForFirstOrderRecurrences(
 }
 
 VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
-    VFRange &Range, SmallVector<PointerDiffInfoValues> RTChecks,
-    bool &HasAliasMask) {
+    VFRange &Range, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
@@ -8917,8 +8905,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     VPBuilder Builder(VecPreheader);
     for (auto C : RTChecks) {
       HasAliasMask = true;
-      VPValue *Sink = Plan->getOrAddLiveIn(C.Sink);
-      VPValue *Src = Plan->getOrAddLiveIn(C.Src);
+      VPValue *Sink = vputils::getOrCreateVPValueForSCEVExpr(*Plan, C.SinkStart,
+                                                             *PSE.getSE());
+      VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(*Plan, C.SrcStart,
+                                                            *PSE.getSE());
       VPValue *M =
           Builder.createNaryOp(VPInstruction::AliasLaneMask, {Sink, Src}, DL,
                                "active.lane.mask.alias");
@@ -10014,26 +10004,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                            F->getDataLayout(), AddBranchWeights);
 
-  // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
-  // here and put them into a list.
-  std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
-      LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks();
-  SmallVector<PointerDiffInfoValues> DiffChecksValues;
-  if (DiffChecks.has_value() &&
-      useActiveLaneMask(CM.getTailFoldingStyle(true))) {
-    Instruction *Loc = L->getLoopPreheader()->getTerminator();
-    for (auto Check : *DiffChecks) {
-      Value *Sink = Checks.expandCodeForMemCheck(Check.SinkStart, Loc);
-      Value *Src = Checks.expandCodeForMemCheck(Check.SrcStart, Loc);
-      DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
-    }
-  }
-
   // Plan how to best vectorize.
-  auto Expand = [&Checks, &L](const SCEV *S) {
-    return Checks.expandCodeForMemCheck(S, L->getLoopPreheader()->getTerminator());
-  };
-  LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand, Checks.HasAliasMask);
+  LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
   if (Checks.HasAliasMask)
     LoopsAliasMasked++;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index b8053bb31b58e3..a2c55461318984 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -146,12 +146,14 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:  entry:
 ; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
 ; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT:    [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; PRED:       vector.memcheck:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
-; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; PRED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index b381392ebf9f16..596e42e9f094de 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -9,8 +9,8 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add = add nuw nsw i32 %i.09, 1
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %data, i32 %add
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i32, ptr %arrayidx, align 4
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add1 = add nsw i32 %1, 5
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i32, ptr %arrayidx, align 4
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add1 = add nsw i32 %0, 5
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx2 = getelementptr inbounds i32, ptr %dst, i32 %i.09
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %add1, ptr %arrayidx2, align 4
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %add, %n

>From a5c7d7d7b1f26313e231946a6ab1bbd0f8b5cb62 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 14 Aug 2024 09:39:39 +0100
Subject: [PATCH 09/11] Make alias lane mask a recipe

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   5 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  40 ++++++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 111 +++++++++---------
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 +
 4 files changed, 96 insertions(+), 61 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 57009e9243e85f..4d2eccdc7b6b63 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8909,9 +8909,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                                                              *PSE.getSE());
       VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(*Plan, C.SrcStart,
                                                             *PSE.getSE());
-      VPValue *M =
-          Builder.createNaryOp(VPInstruction::AliasLaneMask, {Sink, Src}, DL,
-                               "active.lane.mask.alias");
+      VPAliasLaneMaskRecipe *M = new VPAliasLaneMaskRecipe(Src, Sink);
+      VecPreheader->appendRecipe(M);
       if (AliasMask)
         AliasMask = Builder.createAnd(AliasMask, M);
       else
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b9a18db36e24ca..5a8f95f04ed389 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -912,6 +912,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
     case VPRecipeBase::VPEVLBasedIVPHISC:
+    case VPRecipeBase::VPAliasLaneMaskSC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionEVLSC:
@@ -1244,7 +1245,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
     SLPLoad,
     SLPStore,
     ActiveLaneMask,
-    AliasLaneMask,
     ExplicitVectorLength,
     /// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
     /// The first operand is the incoming value from the predecessor in VPlan,
@@ -2698,6 +2698,44 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   }
 };
 
+// Given a pointer A being stored to and a pointer B being read from, both
+// with unknown lengths, create a mask that disables the elements which
+// could overlap across a loop iteration. For example, if A is X and B is
+// X + 2 with a VF of 4, only the final two elements of the loaded vector
+// can be stored since they don't overlap with the stored vector:
+//   %b.vec = load %b ; = [s, t, u, v]
+//   [...]
+//   store %a, %b.vec ; only u and v can be stored as their addresses
+//   don't overlap with %a + (VF - 1)
+class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
+
+public:
+  VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink)
+      : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}) {}
+
+  ~VPAliasLaneMaskRecipe() override = default;
+
+  VPAliasLaneMaskRecipe *clone() override {
+    return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
+
+  void execute(VPTransformState &State) override;
+
+  /// Get the VPValue* for the pointer being read from.
+  VPValue *getSourceValue() const { return getOperand(0); }
+
+  /// Get the VPValue* for the pointer being stored to.
+  VPValue *getSinkValue() const { return getOperand(1); }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
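
For reference, the predicate that VPAliasLaneMaskRecipe::execute emits (and
that the alias-mask tests check for) can be modelled with plain scalar
arithmetic: with diff = (readAddr - writeAddr) / eltSize, lane i stays enabled
when diff is negative or i < diff. Below is a minimal standalone sketch of that
rule; the function name, addresses and fixed VF are illustrative only and not
part of the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the alias lane mask: diff = (readAddr - writeAddr) / eltSize,
// and lane i is enabled when diff < 0 or i < diff. This mirrors the
// sub/sdiv/icmp slt/get.active.lane.mask/or sequence the recipe generates.
static std::vector<bool> aliasLaneMask(int64_t readAddr, int64_t writeAddr,
                                       int64_t eltSize, unsigned vf) {
  int64_t diff = (readAddr - writeAddr) / eltSize;
  bool allLanes = diff < 0; // splatted "neg.compare"
  std::vector<bool> mask(vf);
  for (unsigned lane = 0; lane < vf; ++lane)
    mask[lane] = allLanes || (int64_t)lane < diff; // "ptr.diff.lane.mask" | splat
  return mask;
}

int main() {
  // Read pointer two i8 elements above the write pointer, VF = 4: diff = 2,
  // so lanes 0 and 1 stay enabled and lanes 2 and 3 are masked off.
  for (bool lane : aliasLaneMask(/*readAddr=*/102, /*writeAddr=*/100,
                                 /*eltSize=*/1, /*vf=*/4))
    printf("%d ", lane ? 1 : 0);
  printf("\n");
  return 0;
}
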
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6606383df6e2ee..51a2b45f94cc29 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -425,60 +425,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
   }
-  case VPInstruction::AliasLaneMask: {
-    // Given a pointer A that is being stored to, and pointer B that is being
-    // read from, both with unknown lengths, create a mask that disables
-    // elements which could overlap across a loop iteration. For example, if A
-    // is X and B is X + 2 with VF being 4, only the final two elements of the
-    // loaded vector can be stored since they don't overlap with the stored
-    // vector. %b.vec = load %b ; = [s, t, u, v]
-    // [...]
-    // store %a, %b.vec ; only u and v can be stored as their addresses don't
-    // overlap with %a + (VF - 1)
-    Value *ReadPtr = State.get(getOperand(0), VPIteration(Part, 0));
-    Value *StorePtr = State.get(getOperand(1), VPIteration(Part, 0));
-    unsigned ElementSize = 0;
-
-    // We expect the operands to the alias mask to be ptrtoint. Sometimes it's
-    // an add of a ptrtoint.
-    auto *ReadInsn = cast<Instruction>(ReadPtr);
-    auto *ReadCast = dyn_cast<CastInst>(ReadPtr);
-    if (ReadInsn->getOpcode() == Instruction::Add)
-      ReadCast = dyn_cast<CastInst>(ReadInsn->getOperand(0));
-
-    if (ReadCast && ReadCast->getOpcode() == Instruction::PtrToInt) {
-      Value *Ptr = ReadCast->getOperand(0);
-      for (auto *Use : Ptr->users()) {
-        if (auto *GEP = dyn_cast<GetElementPtrInst>(Use)) {
-          auto *EltVT = GEP->getSourceElementType();
-          if (EltVT->isArrayTy())
-            ElementSize = EltVT->getArrayElementType()->getScalarSizeInBits() *
-                          EltVT->getArrayNumElements();
-          else
-            ElementSize =
-                GEP->getSourceElementType()->getScalarSizeInBits() / 8;
-          break;
-        }
-      }
-    }
-    assert(ElementSize > 0 && "Couldn't get element size from pointer");
-    // Calculate how many elements the pointers differ by
-    Value *Diff = Builder.CreateSub(StorePtr, ReadPtr, "sub.diff");
-    auto *Type = Diff->getType();
-    Value *MemEltSize = ConstantInt::get(Type, ElementSize);
-    Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
-    // If the difference is negative then some elements may alias
-    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLT, DiffDiv,
-                                    ConstantInt::get(Type, 0), "neg.compare");
-    // Splat the compare result then OR it with a lane mask
-    Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
-    Value *DiffMask = Builder.CreateIntrinsic(
-        Intrinsic::get_active_lane_mask,
-        {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
-        {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
-    return Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat,
-                               Name);
-  }
   // Count the number of bits set in each lane and reduce the result to a scalar
   case VPInstruction::PopCount: {
     if (Part != 0)
@@ -895,9 +841,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ResumePhi:
     O << "resume-phi";
     break;
-  case VPInstruction::AliasLaneMask:
-    O << "alias lane mask";
-    break;
   case VPInstruction::PopCount:
     O << "popcount";
     break;
@@ -2867,6 +2810,60 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
+  IRBuilderBase Builder = State.Builder;
+  Value *SinkValue = State.get(getSinkValue(), 0, true);
+  Value *SourceValue = State.get(getSourceValue(), 0, true);
+
+  unsigned ElementSize = 0;
+  auto *ReadInsn = cast<Instruction>(SourceValue);
+  auto *ReadCast = dyn_cast<CastInst>(SourceValue);
+  if (ReadInsn->getOpcode() == Instruction::Add)
+    ReadCast = dyn_cast<CastInst>(ReadInsn->getOperand(0));
+
+  if (ReadCast && ReadCast->getOpcode() == Instruction::PtrToInt) {
+    Value *Ptr = ReadCast->getOperand(0);
+    for (auto *Use : Ptr->users()) {
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(Use)) {
+        auto *EltVT = GEP->getSourceElementType();
+        if (EltVT->isArrayTy())
+          ElementSize = EltVT->getArrayElementType()->getScalarSizeInBits() *
+                        EltVT->getArrayNumElements();
+        else
+          ElementSize = GEP->getSourceElementType()->getScalarSizeInBits() / 8;
+        break;
+      }
+    }
+  }
+  assert(ElementSize > 0 && "Couldn't get element size from pointer");
+
+  Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
+  auto *Type = Diff->getType();
+  Value *MemEltSize = ConstantInt::get(Type, ElementSize);
+  Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
+  // If the difference is negative then some elements may alias
+  Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLT, DiffDiv,
+                                  ConstantInt::get(Type, 0), "neg.compare");
+  // Splat the compare result then OR it with a lane mask
+  Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
+  Value *DiffMask = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask,
+      {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+      {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+  Value *Or = Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat);
+  State.set(this, Or, 0, /*IsScalar=*/false);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "ALIAS-LANE-MASK ";
+  getSourceValue()->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getSinkValue()->printAsOperand(O, SlotTracker);
+}
+#endif
+
 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "cannot be used in per-lane");
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 452c977106a773..32b21ddfeb7710 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -338,6 +338,7 @@ class VPDef {
   using VPRecipeTy = enum {
     VPBranchOnMaskSC,
     VPDerivedIVSC,
+    VPAliasLaneMaskSC,
     VPExpandSCEVSC,
     VPInstructionSC,
     VPInterleaveSC,

>From 783470800c2d18ba8be4e2f8ee728e6c42887746 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 20 Aug 2024 10:19:50 +0100
Subject: [PATCH 10/11] Improve opt test

---
 .../LoopVectorize/AArch64/alias_mask.ll       | 116 ++++++
 .../LoopVectorize/AArch64/whilewr-opt.ll      | 369 ------------------
 2 files changed, 116 insertions(+), 369 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
 delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 00000000000000..ac6c4c534744e3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B4:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[C1:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[C1]], [[B2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP5]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B4]], [[C3]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[DIFF]], 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <vscale x 16 x i1> [[TMP8]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP23]])
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i8 [[TMP24]] to i64
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT:    [[TMP26:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i32 0
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP29]], [[TMP28]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+  store i8 %add, ptr %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
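
The vector body above also shows how the mask is consumed: it is ANDed with the
loop's active lane mask to predicate the loads and the store, and the induction
step becomes the number of alias-mask lanes that stay enabled (the zext plus
vector.reduce.add feeding [[INDEX_NEXT]]) rather than a full VF. The following
is a small scalar model of that strip-mined loop, under the assumption of a
fixed VF and a precomputed alias mask; the names are illustrative only.

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the predicated vector body: each iteration ANDs the active
// lane mask with the loop-invariant alias mask, performs the masked
// c[i] = a[i] + b[i], and advances the index by the popcount of the alias
// mask instead of the full VF.
static void maskedAddLoop(const uint8_t *a, const uint8_t *b, uint8_t *c,
                          uint64_t n, const std::vector<bool> &aliasMask) {
  const unsigned vf = aliasMask.size();
  uint64_t step = 0;
  for (bool lane : aliasMask)
    step += lane; // popcount of the alias mask
  if (step == 0)
    return; // guard against a degenerate all-false mask
  for (uint64_t index = 0; index < n; index += step) {
    for (unsigned lane = 0; lane < vf; ++lane) {
      bool active = index + lane < n; // active.lane.mask for this lane
      if (active && aliasMask[lane])  // combined predicate
        c[index + lane] = a[index + lane] + b[index + lane];
    }
  }
}

int main() {
  uint8_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint8_t b[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  uint8_t c[8] = {0};
  // Alias mask keeping only the first two of four lanes, as produced when the
  // read pointer sits two elements above the write pointer.
  maskedAddLoop(a, b, c, 8, {true, true, false, false});
  for (uint8_t v : c)
    printf("%u ", v);
  printf("\n");
  return 0;
}
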
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll
deleted file mode 100644
index b3fb78df060820..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll
+++ /dev/null
@@ -1,369 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=loop-vectorize -mtriple=aarch64-linux-gnu -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
-define dso_local void @whilewr_8(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
-; CHECK-LABEL: define dso_local void @whilewr_8(
-; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
-; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 16 x i8>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
-  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
-  %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
-  %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
-  %2 = zext i8 %1 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
-  %4 = getelementptr inbounds i8, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
-  %5 = getelementptr inbounds i8, ptr %b, i64 %index
-  %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
-  %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
-  %7 = getelementptr inbounds i8, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
-  %index.next = add i64 %index, %2
-  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
-  br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define dso_local void @whilewr_16(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
-; CHECK-LABEL: define dso_local void @whilewr_16(
-; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B15:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
-; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
-; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP4]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP6]], ptr [[TMP7]], i32 2, <vscale x 8 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %b15, %c14
-  %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
-  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
-  %0 = zext <vscale x 8 x i1> %active.lane.mask.alias to <vscale x 8 x i8>
-  %1 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %0)
-  %2 = zext i8 %1 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = and <vscale x 8 x i1> %active.lane.mask, %active.lane.mask.alias
-  %4 = getelementptr inbounds i16, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %3, <vscale x 8 x i16> poison)
-  %5 = getelementptr inbounds i16, ptr %b, i64 %index
-  %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %3, <vscale x 8 x i16> poison)
-  %6 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
-  %7 = getelementptr inbounds i16, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %6, ptr %7, i32 2, <vscale x 8 x i1> %3)
-  %index.next = add i64 %index, %2
-  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %8 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
-  br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define dso_local void @whilewr_32(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
-; CHECK-LABEL: define dso_local void @whilewr_32(
-; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
-; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
-; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 4 x i8>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP4]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c12 = ptrtoint ptr %c to i64
-  %b13 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %b13, %c12
-  %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
-  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
-  %0 = zext <vscale x 4 x i1> %active.lane.mask.alias to <vscale x 4 x i8>
-  %1 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %0)
-  %2 = zext i8 %1 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = and <vscale x 4 x i1> %active.lane.mask, %active.lane.mask.alias
-  %4 = getelementptr inbounds i32, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %3, <vscale x 4 x i32> poison)
-  %5 = getelementptr inbounds i32, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %3, <vscale x 4 x i32> poison)
-  %6 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
-  %7 = getelementptr inbounds i32, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %6, ptr %7, i32 4, <vscale x 4 x i1> %3)
-  %index.next = add i64 %index, %2
-  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %8 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
-  br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define dso_local void @whilewr_64(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
-; CHECK-LABEL: define dso_local void @whilewr_64(
-; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-; CHECK-NEXT:    [[B13:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-; CHECK-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
-; CHECK-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
-; CHECK-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 2 x i8>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c12 = ptrtoint ptr %c to i64
-  %b13 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %b13, %c12
-  %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
-  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
-  %0 = zext <vscale x 2 x i1> %active.lane.mask.alias to <vscale x 2 x i8>
-  %1 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %0)
-  %2 = zext i8 %1 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = and <vscale x 2 x i1> %active.lane.mask, %active.lane.mask.alias
-  %4 = getelementptr inbounds i64, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %3, <vscale x 2 x i64> poison)
-  %5 = getelementptr inbounds i64, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %3, <vscale x 2 x i64> poison)
-  %6 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
-  %7 = getelementptr inbounds i64, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %6, ptr %7, i32 8, <vscale x 2 x i1> %3)
-  %index.next = add i64 %index, %2
-  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %8 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
-  br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) #1
-
-declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) #2
-
-declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>) #3
-
-declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>) #4
-
-declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
-
-declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>) #2
-
-declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #3
-
-declare i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8>) #4
-
-declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) #1
-
-declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #2
-
-declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>) #3
-
-declare i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8>) #4
-
-declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1
-
-declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) #2
-
-declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>) #3
-
-declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8>) #4
-
-attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
-attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
-attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

>From 9039996ccad0d1d88ef53de26d0f3da09ddc1a7f Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 27 Aug 2024 17:33:49 +0100
Subject: [PATCH 11/11] Post-rebase clean-up

---
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h   |  2 --
 .../Vectorize/LoopVectorizationPlanner.h          |  4 ++--
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp   | 13 +++++++------
 llvm/lib/Transforms/Vectorize/VPlan.h             | 15 ---------------
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp    |  2 --
 5 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 5a596bd333ca26..73d9c26ed6b1b7 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -16,8 +16,6 @@
 
 #include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include <optional>
 #include <variant>
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 8099b95de77e1b..521bce1d4c396c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -26,7 +26,6 @@
 
 #include "VPlan.h"
 #include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Support/InstructionCost.h"
 
 namespace llvm {
@@ -369,7 +368,8 @@ class LoopVectorizationPlanner {
   /// and the vector loop should be entered even if the pointers alias across a
   /// loop iteration.
   void plan(ElementCount UserVF, unsigned UserIC,
-       std::optional<ArrayRef<PointerDiffInfo>> DiffChecks, bool &HasAliasMask);
+            std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+            bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4d2eccdc7b6b63..aa56a648f92257 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7002,7 +7002,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
+void LoopVectorizationPlanner::plan(
+    ElementCount UserVF, unsigned UserIC,
     std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
@@ -7012,8 +7013,6 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
-  // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
-  // here and put them into a list.
   ArrayRef<PointerDiffInfo> DiffChecks;
   if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
     DiffChecks = *RTChecks;
@@ -10000,11 +9999,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
-  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                           F->getDataLayout(), AddBranchWeights);
+  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getDataLayout(),
+                           AddBranchWeights);
 
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Checks.HasAliasMask);
+  LVP.plan(UserVF, UserIC,
+           LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
+           Checks.HasAliasMask);
   VectorizationFactor VF = LVP.computeBestVF();
   if (Checks.HasAliasMask)
     LoopsAliasMasked++;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5a8f95f04ed389..2a924b5173da5d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -43,7 +43,6 @@
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/InstructionCost.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -3882,20 +3881,6 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
 bool isHeaderMask(const VPValue *V, VPlan &Plan);
 } // end namespace vputils
 
-/// A pair of pointers that could overlap across a loop iteration.
-struct PointerDiffInfoValues {
-  /// The pointer being read from
-  Value *Src;
-  /// The pointer being stored to
-  Value *Sink;
-
-  PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
-                        SCEVExpander Exp, Instruction *Loc)
-      : Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
-        Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
-  PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
-};
-
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 51a2b45f94cc29..e837ee6021402b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -18,11 +18,9 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"


