[clang] [llvm] [LV] Mask off possibly aliasing vector lanes (PR #100579)
Sam Tebbs via cfe-commits
cfe-commits at lists.llvm.org
Wed Aug 14 09:23:47 PDT 2024
https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/100579
>From 5019914e445a813b55ed5d5d087e4a4008ef15b2 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 26 Jun 2024 09:55:45 +0100
Subject: [PATCH 1/9] [LV] Mask off possibly aliasing vector lanes
When vectorising a loop that uses loads and stores, the pointers can
overlap if their difference is less than the vectorisation factor. For
example, if address 20 is being stored to and address 23 is being loaded
from, they overlap when the vectorisation factor is 4 or higher.
Currently LoopVectorize emits a runtime check and branches to a scalar
loop in these cases. However, if we construct a mask that disables the
overlapping (aliasing) lanes, then the vectorised loop can be entered
safely, as long as the loads and stores are masked off with it.
This PR modifies the LoopVectorizer and VPlan to create such a mask and
always branch to the vector loop. Currently this is only done when
tail-predicating, but future work will extend it to other cases as well.
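For illustration, the mask built for each possibly-aliasing (load, store)
pair in the tests below is roughly equivalent to the following scalar
per-lane predicate (a sketch with illustrative names, not part of the
patch). The canonical IV is then advanced by the popcount of this mask,
so any lanes disabled here are processed by a later vector iteration
instead.

  #include <stdbool.h>
  #include <stdint.h>

  /* Per-lane alias predicate matching the IR in the tests below: lane i
     is enabled when the byte difference (load pointer minus store
     pointer) is below -(element size - 1), or when i is below that
     difference measured in elements. */
  static bool lane_enabled(uintptr_t load_ptr, uintptr_t store_ptr,
                           intptr_t elt_size, intptr_t lane) {
    intptr_t byte_diff = (intptr_t)(load_ptr - store_ptr); /* sub.diff    */
    intptr_t elt_diff = byte_diff / elt_size;              /* diff (sdiv) */
    bool splat = byte_diff < -(elt_size - 1);              /* neg.compare */
    /* get.active.lane.mask(0, elt_diff) enables lanes [0, elt_diff) */
    return splat || lane < elt_diff;
  }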
---
clang/test/CodeGen/loop-alias-mask.c | 404 ++++++++++++++++++
.../llvm/Analysis/LoopAccessAnalysis.h | 16 +-
.../Vectorize/LoopVectorizationPlanner.h | 24 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 143 +++++--
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 91 +++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 31 +-
.../Transforms/Vectorize/VPlanTransforms.h | 5 +-
.../LoopVectorize/AArch64/whilewr-opt.ll | 369 ++++++++++++++++
.../runtime-check-small-clamped-bounds.ll | 22 +-
.../runtime-checks-difference.ll | 102 ++---
11 files changed, 1102 insertions(+), 107 deletions(-)
create mode 100644 clang/test/CodeGen/loop-alias-mask.c
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll
diff --git a/clang/test/CodeGen/loop-alias-mask.c b/clang/test/CodeGen/loop-alias-mask.c
new file mode 100644
index 00000000000000..76c3b5deddfa09
--- /dev/null
+++ b/clang/test/CodeGen/loop-alias-mask.c
@@ -0,0 +1,404 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang --target=aarch64-linux-gnu -march=armv9+sme2 -emit-llvm -S -g0 -O3 -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize %s -o - | FileCheck %s
+#include <stdint.h>
+
+// CHECK-LABEL: define dso_local void @alias_mask_8(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP3]]), !tbaa [[TBAA6]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_8(uint8_t *restrict a, uint8_t * b, uint8_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_16(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP4]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13:![0-9]+]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
+// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP6]], ptr [[TMP7]], i32 2, <vscale x 8 x i1> [[TMP3]]), !tbaa [[TBAA13]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_16(uint16_t *restrict a, uint16_t * b, uint16_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_32(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 4 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP4]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16:![0-9]+]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
+// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP3]]), !tbaa [[TBAA16]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_32(uint32_t *restrict a, uint32_t * b, uint32_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_64(
+// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 2 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19:![0-9]+]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
+// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[TMP3]]), !tbaa [[TBAA19]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_64(uint64_t *restrict a, uint64_t * b, uint64_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_8(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[A15:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT: [[B16:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
+// CHECK-NEXT: [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], 0
+// CHECK-NEXT: [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
+// CHECK-NEXT: [[DOTSPLAT22:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT21]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF18]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 16 x i1> [[TMP0]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP7]], ptr [[TMP8]], i32 1, <vscale x 16 x i1> [[TMP4]]), !tbaa [[TBAA6]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_multiple_8(uint8_t * a, uint8_t * b, uint8_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_16(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[A15:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT: [[B16:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
+// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
+// CHECK-NEXT: [[DIFF19:%.*]] = sdiv i64 [[SUB_DIFF18]], 2
+// CHECK-NEXT: [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], -1
+// CHECK-NEXT: [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
+// CHECK-NEXT: [[DOTSPLAT22:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT21]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF19]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 8 x i1> [[TMP0]] to <vscale x 8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP6]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
+// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP7]], ptr [[TMP8]], i32 2, <vscale x 8 x i1> [[TMP4]]), !tbaa [[TBAA13]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_multiple_16(uint16_t * a, uint16_t * b, uint16_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_32(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[A13:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT: [[B14:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
+// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
+// CHECK-NEXT: [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 4
+// CHECK-NEXT: [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -3
+// CHECK-NEXT: [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
+// CHECK-NEXT: [[DOTSPLAT20:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT19]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF17]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 4 x i1> [[TMP0]] to <vscale x 4 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
+// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP4]]), !tbaa [[TBAA16]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP24:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_multiple_32(uint32_t * a, uint32_t * b, uint32_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
+
+// CHECK-LABEL: define dso_local void @alias_mask_multiple_64(
+// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK: for.body.preheader:
+// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+// CHECK-NEXT: [[A13:%.*]] = ptrtoint ptr [[A]] to i64
+// CHECK-NEXT: [[B14:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
+// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
+// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+// CHECK-NEXT: [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
+// CHECK-NEXT: [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 8
+// CHECK-NEXT: [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -7
+// CHECK-NEXT: [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
+// CHECK-NEXT: [[DOTSPLAT20:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT19]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT: [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF17]])
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 2 x i1> [[TMP0]] to <vscale x 2 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
+// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+// CHECK: vector.body:
+// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+// CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
+// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[TMP4]]), !tbaa [[TBAA19]]
+// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
+// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP25:![0-9]+]]
+// CHECK: for.cond.cleanup:
+// CHECK-NEXT: ret void
+//
+void alias_mask_multiple_64(uint64_t * a, uint64_t * b, uint64_t * c, int n) {
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < n; i++) {
+ c[i] = a[i] + b[i];
+ }
+}
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index afafb74bdcb0ac..5ad5015b460d9d 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -16,8 +16,8 @@
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <optional>
#include <variant>
@@ -448,6 +448,20 @@ struct PointerDiffInfo {
NeedsFreeze(NeedsFreeze) {}
};
+/// A pair of pointers that could overlap across a loop iteration.
+struct PointerDiffInfoValues {
+ /// The pointer being read from
+ Value *Src;
+ /// The pointer being stored to
+ Value *Sink;
+
+ PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
+ SCEVExpander Exp, Instruction *Loc)
+ : Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
+ Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
+ PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
+};
+
/// Holds information about the memory runtime legality checks to verify
/// that a group of pointers do not overlap.
class RuntimePointerChecking {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c63cf0c37f2f97..67720c2fa8f693 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -26,6 +26,7 @@
#include "VPlan.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/InstructionCost.h"
namespace llvm {
@@ -356,7 +357,13 @@ class LoopVectorizationPlanner {
/// Plan how to best vectorize, return the best VF and its cost, or
/// std::nullopt if vectorization and interleaving should be avoided up front.
- std::optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
+ /// RTChecks is a list of pointer pairs that should be checked for aliasing.
+ /// HasAliasMask is set to true if an alias mask is generated, in which case
+ /// the vector loop should be entered even if the pointers alias across a
+ /// loop iteration.
+ std::optional<VectorizationFactor>
+ plan(ElementCount UserVF, unsigned UserIC,
+ SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
@@ -429,12 +436,23 @@ class LoopVectorizationPlanner {
/// returned VPlan is valid for. If no VPlan can be built for the input range,
/// set the largest included VF to the maximum VF for which no plan could be
/// built.
- VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
+ /// RTChecks is a list of pointer pairs that should be checked for aliasing.
+ /// HasAliasMask is set to true if an alias mask is generated, in which case
+ /// the vector loop should be entered even if the pointers alias across a
+ /// loop iteration.
+ VPlanPtr
+ tryToBuildVPlanWithVPRecipes(VFRange &Range,
+ SmallVector<PointerDiffInfoValues> RTChecks,
+ bool &HasAliasMask);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
- void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+ /// RTChecks contains a list of pointer pairs that an alias mask should be
+ /// generated for.
+ void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
+ SmallVector<PointerDiffInfoValues> RTChecks,
+ bool &HasAliasMask);
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b709ccc3c35a7d..5ab8b61595d6c2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1829,6 +1829,10 @@ class GeneratedRTChecks {
Loop *OuterLoop = nullptr;
public:
+ /// Set by VPlan when the vector loop should be entered even when runtime
+ /// checks determine that pointers alias within an iteration.
+ bool HasAliasMask = false;
+
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
TargetTransformInfo *TTI, const DataLayout &DL,
bool AddBranchWeights)
@@ -1869,9 +1873,11 @@ class GeneratedRTChecks {
const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
if (RtPtrChecking.Need) {
- auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
- MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
- "vector.memcheck");
+ if (!MemCheckBlock) {
+ auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
+ MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
+ "vector.memcheck");
+ }
auto DiffChecks = RtPtrChecking.getDiffChecks();
if (DiffChecks) {
@@ -1929,6 +1935,10 @@ class GeneratedRTChecks {
OuterLoop = L->getParentLoop();
}
+ Value *expandCodeForMemCheck(const SCEV *Scev, Instruction *Loc) {
+ return MemCheckExp.expandCodeFor(Scev, Scev->getType(), Loc);
+ }
+
InstructionCost getCost() {
if (SCEVCheckBlock || MemCheckBlock)
LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
@@ -2103,11 +2113,18 @@ class GeneratedRTChecks {
if (OuterLoop)
OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
- if (AddBranchWeights) {
+ // TODO: Branch to the vector preheader conditionally based on the number of
+ // non-aliasing elements. The scalar loop will likely be better if only one
+ // or two elements will be processed per vectorised loop iteration.
+
+ // Jump to the vector preheader unconditionally if it's safe to do so
+ // because an alias mask has been set up.
+ BranchInst &BI = HasAliasMask
+ ? *BranchInst::Create(LoopVectorPreHeader)
+ : *BranchInst::Create(Bypass, LoopVectorPreHeader,
+ MemRuntimeCheckCond);
+ if (!HasAliasMask && AddBranchWeights)
setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
- }
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
MemCheckBlock->getTerminator()->setDebugLoc(
Pred->getTerminator()->getDebugLoc());
@@ -2576,7 +2593,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
});
}
- LoopBypassBlocks.push_back(MemCheckBlock);
+ // If an alias mask has been set up then we don't need the bypass, as the
+ // vector preheader will be branched to unconditionally.
+ if (!RTChecks.HasAliasMask)
+ LoopBypassBlocks.push_back(MemCheckBlock);
AddedSafetyChecks = true;
@@ -6885,7 +6905,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
}
std::optional<VectorizationFactor>
-LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
+ SmallVector<PointerDiffInfoValues> RTChecks,
+ bool &HasAliasMask) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -6922,7 +6944,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- buildVPlansWithVPRecipes(UserVF, UserVF);
+ buildVPlansWithVPRecipes(UserVF, UserVF, RTChecks, HasAliasMask);
if (!hasPlanWithVF(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
<< ".\n");
@@ -6956,8 +6978,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInstsToScalarize(VF);
}
- buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
- buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
+ RTChecks, HasAliasMask);
+ buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
+ RTChecks, HasAliasMask);
LLVM_DEBUG(printPlans(dbgs()));
if (VPlans.empty())
@@ -7383,7 +7407,6 @@ LoopVectorizationPlanner::executePlan(
CanonicalIVStartValue, State);
BestVPlan.execute(&State);
-
// 2.5 Collect reduction resume values.
DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
auto *ExitVPBB =
@@ -7627,7 +7650,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// reduction phis in the scalar loop preheader.
if (EPI.SCEVSafetyCheck)
LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
- if (EPI.MemSafetyCheck)
+ if (EPI.MemSafetyCheck && !RTChecks.HasAliasMask)
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
@@ -7848,6 +7871,7 @@ void VPRecipeBuilder::createHeaderMask() {
// constructing the desired canonical IV in the header block as its first
// non-phi instructions.
+ VPValue *BlockMask = nullptr;
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
@@ -7855,7 +7879,6 @@ void VPRecipeBuilder::createHeaderMask() {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- VPValue *BlockMask = nullptr;
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
BlockMaskCache[Header] = BlockMask;
@@ -8350,14 +8373,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return tryToWiden(Instr, Operands, VPBB);
}
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
- ElementCount MaxVF) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
+ ElementCount MinVF, ElementCount MaxVF,
+ SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
- if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
+ if (auto Plan =
+ tryToBuildVPlanWithVPRecipes(SubRange, RTChecks, HasAliasMask)) {
// Now optimize the initial VPlan.
if (!Plan->hasVF(ElementCount::getFixed(1)))
VPlanTransforms::truncateToMinimalBitwidths(
@@ -8378,7 +8403,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// Add the necessary canonical IV and branch recipes required to control the
// loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
- DebugLoc DL) {
+ DebugLoc DL, VPValue *AliasMask) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getOrAddLiveIn(StartIdx);
@@ -8389,9 +8414,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
Header->insert(CanonicalIVPHI, Header->begin());
VPBuilder Builder(TopRegion->getExitingBasicBlock());
- // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+ // Add a VPInstruction to increment the scalar canonical IV by VF * UF, or by
+ // the popcount of the alias mask if there is one.
+ VPValue *IncrementBy = &Plan.getVFxUF();
+ if (AliasMask) {
+ IncrementBy = Builder.createNaryOp(VPInstruction::PopCount, {AliasMask}, DL,
+ "popcount");
+ auto *IVType = CanonicalIVPHI->getScalarType();
+
+ if (IVType->getScalarSizeInBits() < 64) {
+ auto *Cast =
+ new VPScalarCastRecipe(Instruction::Trunc, IncrementBy, IVType);
+ Cast->insertAfter(IncrementBy->getDefiningRecipe());
+ IncrementBy = Cast;
+ }
+ }
+
auto *CanonicalIVIncrement = Builder.createOverflowingOp(
- Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
+ Instruction::Add, {CanonicalIVPHI, IncrementBy}, {HasNUW, false}, DL,
"index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
@@ -8480,8 +8520,9 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
}
}
-VPlanPtr
-LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
+ VFRange &Range, SmallVector<PointerDiffInfoValues> RTChecks,
+ bool &HasAliasMask) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8520,7 +8561,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// When not folding the tail, we know that the induction increment will not
// overflow.
bool HasNUW = Style == TailFoldingStyle::None;
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+
+ VPValue *AliasMask = nullptr;
+ if (useActiveLaneMask(Style)) {
+ // Create an alias mask for each possibly-aliasing pointer pair. If there
+ // are multiple, they are combined with ANDs.
+ VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
+ auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
+ VPBuilder Builder(VecPreheader);
+ for (auto C : RTChecks) {
+ HasAliasMask = true;
+ VPValue *Sink = Plan->getOrAddLiveIn(C.Sink);
+ VPValue *Src = Plan->getOrAddLiveIn(C.Src);
+ VPValue *M =
+ Builder.createNaryOp(VPInstruction::AliasLaneMask, {Sink, Src}, DL,
+ "active.lane.mask.alias");
+ if (AliasMask)
+ AliasMask = Builder.createAnd(AliasMask, M);
+ else
+ AliasMask = M;
+ }
+ }
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL,
+ AliasMask);
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
@@ -8737,7 +8800,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool WithoutRuntimeCheck =
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
- WithoutRuntimeCheck);
+ WithoutRuntimeCheck, AliasMask);
}
return Plan;
}
@@ -8777,7 +8840,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// is guaranteed to not wrap.
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
- DebugLoc());
+ DebugLoc(), nullptr);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9516,6 +9579,7 @@ static bool processLoopInVPlanNativePath(
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+
return true;
}
@@ -9838,16 +9902,33 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
+ bool AddBranchWeights =
+ hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
+ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getDataLayout(),
+ AddBranchWeights);
+
+ // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
+ // here and put them into a list.
+ std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
+ LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks();
+ SmallVector<PointerDiffInfoValues> DiffChecksValues;
+ if (DiffChecks.has_value() &&
+ useActiveLaneMask(CM.getTailFoldingStyle(true))) {
+ Instruction *Loc = L->getLoopPreheader()->getTerminator();
+ for (auto Check : *DiffChecks) {
+ Value *Sink = Checks.expandCodeForMemCheck(Check.SinkStart, Loc);
+ Value *Src = Checks.expandCodeForMemCheck(Check.SrcStart, Loc);
+ DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
+ }
+ }
+
// Plan how to best vectorize, return the best VF and its cost.
- std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+ std::optional<VectorizationFactor> MaybeVF =
+ LVP.plan(UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
- bool AddBranchWeights =
- hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
- GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getDataLayout(), AddBranchWeights);
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0b596e7e4f633c..e850c4ff705a5d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1239,6 +1239,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
SLPLoad,
SLPStore,
ActiveLaneMask,
+ AliasLaneMask,
ExplicitVectorLength,
/// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
/// The first operand is the incoming value from the predecessor in VPlan,
@@ -1258,6 +1259,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
// scalar.
ExtractFromEnd,
LogicalAnd, // Non-poison propagating logical And.
+ PopCount,
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d3..c48fc71d156cfc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -18,9 +18,11 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
@@ -363,6 +365,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::PopCount:
return true;
default:
return false;
@@ -428,6 +431,85 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
+ case VPInstruction::AliasLaneMask: {
+ // Given a pointer A that is being stored to, and pointer B that is being
+ // read from, both with unknown lengths, create a mask that disables
+ // elements which could overlap across a loop iteration. For example, if A
+ // is X and B is X + 2 with VF being 4, only the final two elements of the
+ // loaded vector can be stored since they don't overlap with the stored
+ // vector:
+ //   %b.vec = load %b      ; = [s, t, u, v]
+ //   [...]
+ //   store %a, %b.vec      ; only u and v can be stored as their addresses
+ //                         ; don't overlap with %a + (VF - 1)
+ Value *ReadPtr = State.get(getOperand(0), VPIteration(Part, 0));
+ Value *StorePtr = State.get(getOperand(1), VPIteration(Part, 0));
+ unsigned ElementSize = 0;
+
+ // We expect the operands of the alias mask to be ptrtoint; sometimes an
+ // operand is an add of a ptrtoint.
+ auto *ReadInsn = cast<Instruction>(ReadPtr);
+ auto *ReadCast = dyn_cast<CastInst>(ReadPtr);
+ if (ReadInsn->getOpcode() == Instruction::Add)
+ ReadCast = dyn_cast<CastInst>(ReadInsn->getOperand(0));
+
+ if (ReadCast && ReadCast->getOpcode() == Instruction::PtrToInt) {
+ Value *Ptr = ReadCast->getOperand(0);
+ for (auto *Use : Ptr->users()) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Use)) {
+ auto *EltVT = GEP->getSourceElementType();
+ if (EltVT->isArrayTy())
+ ElementSize = EltVT->getArrayElementType()->getScalarSizeInBits() *
+ EltVT->getArrayNumElements();
+ else
+ ElementSize =
+ GEP->getSourceElementType()->getScalarSizeInBits() / 8;
+ break;
+ }
+ }
+ }
+ assert(ElementSize > 0 && "Couldn't get element size from pointer");
+ // Calculate how many elements the pointers differ by
+ Value *Diff = Builder.CreateSub(StorePtr, ReadPtr, "sub.diff");
+ auto *Type = Diff->getType();
+ Value *MemEltSize = ConstantInt::get(Type, ElementSize);
+ Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
+ // If the difference is negative then some elements may alias
+ Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLT, DiffDiv,
+ ConstantInt::get(Type, 0), "neg.compare");
+ // Splat the compare result then OR it with a lane mask
+ Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
+ Value *DiffMask = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask,
+ {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+ {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+ return Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat,
+ Name);
+ }
+ // Count the number of bits set in each lane and reduce the result to a scalar
+ case VPInstruction::PopCount: {
+ if (Part != 0)
+ return State.get(this, 0, /*IsScalar*/ true);
+ Value *Op = State.get(getOperand(0), Part);
+ auto *VT = Op->getType();
+ Value *Cnt = Op;
+
+ // i1 vectors can just use the add reduction. Bigger elements need a ctpop
+ // first.
+ if (VT->getScalarSizeInBits() > 1)
+ Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});
+
+ auto *VecVT = cast<VectorType>(VT);
+ // Extend to i8, since i1 is too narrow to use with the add reduction
+ if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
+ Cnt = Builder.CreateCast(
+ Instruction::ZExt, Cnt,
+ VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
+ }
+
+ Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
+ Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
+ return Cnt;
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
@@ -681,7 +763,8 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::PopCount;
}
bool VPInstruction::isSingleScalar() const {
@@ -818,6 +901,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumePhi:
O << "resume-phi";
break;
+ case VPInstruction::AliasLaneMask:
+ O << "alias lane mask";
+ break;
+ case VPInstruction::PopCount:
+ O << "popcount";
+ break;
case VPInstruction::ExplicitVectorLength:
O << "EXPLICIT-VECTOR-LENGTH";
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e311..d5795734c91e9d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1256,8 +1256,10 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
// %Negated = Not %ALM
// branch-on-cond %Negated
//
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
- VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPValue *
+addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan,
+ bool DataAndControlFlowWithoutRuntimeCheck,
+ VPValue *AliasMask) {
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
auto *CanonicalIVPHI = Plan.getCanonicalIV();
@@ -1298,14 +1300,22 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");
// Create the active lane mask instruction in the VPlan preheader.
- auto *EntryALM =
+ VPValue *Mask =
Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
DL, "active.lane.mask.entry");
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
- auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(Mask, DebugLoc());
LaneMaskPhi->insertAfter(CanonicalIVPHI);
+ VPValue *LaneMask = LaneMaskPhi;
+ if (AliasMask) {
+ // AND in the alias mask so the iteration only processes non-aliasing lanes
+ Builder.setInsertPoint(CanonicalIVPHI->getParent(),
+ CanonicalIVPHI->getParent()->getFirstNonPhi());
+ LaneMask = Builder.createNaryOp(Instruction::BinaryOps::And,
+ {LaneMaskPhi, AliasMask}, DL);
+ }
// Create the active lane mask for the next iteration of the loop before the
// original terminator.
@@ -1324,7 +1334,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
auto *NotMask = Builder.createNot(ALM, DL);
Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
OriginalTerminator->eraseFromParent();
- return LaneMaskPhi;
+ return LaneMask;
}
/// Collect all VPValues representing a header mask through the (ICMP_ULE,
@@ -1374,23 +1384,24 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
void VPlanTransforms::addActiveLaneMask(
VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
- bool DataAndControlFlowWithoutRuntimeCheck) {
+ bool DataAndControlFlowWithoutRuntimeCheck, VPValue *AliasMask) {
+
assert((!DataAndControlFlowWithoutRuntimeCheck ||
UseActiveLaneMaskForControlFlow) &&
"DataAndControlFlowWithoutRuntimeCheck implies "
"UseActiveLaneMaskForControlFlow");
- auto FoundWidenCanonicalIVUser =
+ auto *FoundWidenCanonicalIVUser =
find_if(Plan.getCanonicalIV()->users(),
[](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
- assert(FoundWidenCanonicalIVUser &&
+ assert(FoundWidenCanonicalIVUser && *FoundWidenCanonicalIVUser &&
"Must have widened canonical IV when tail folding!");
auto *WideCanonicalIV =
cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
- VPSingleDefRecipe *LaneMask;
+ VPValue *LaneMask;
if (UseActiveLaneMaskForControlFlow) {
LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
- Plan, DataAndControlFlowWithoutRuntimeCheck);
+ Plan, DataAndControlFlowWithoutRuntimeCheck, AliasMask);
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c2..e82226ac813c7f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -73,9 +73,12 @@ struct VPlanTransforms {
/// creation) and instead it is handled using active-lane-mask. \p
/// DataAndControlFlowWithoutRuntimeCheck implies \p
/// UseActiveLaneMaskForControlFlow.
+ /// \p AliasMask is a mask, built from the runtime pointer-difference checks,
+ /// that disables any lanes which would alias within a loop iteration; it may
+ /// be null if no such mask is needed.
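+ /// A tail-folded plan using an alias mask would typically be set up with a
+ /// call such as (illustrative only):
+ ///   addActiveLaneMask(Plan, /*UseActiveLaneMaskForControlFlow=*/true,
+ ///                     /*DataAndControlFlowWithoutRuntimeCheck=*/true,
+ ///                     AliasMask);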
static void addActiveLaneMask(VPlan &Plan,
bool UseActiveLaneMaskForControlFlow,
- bool DataAndControlFlowWithoutRuntimeCheck);
+ bool DataAndControlFlowWithoutRuntimeCheck,
+ VPValue *AliasMask);
/// Insert truncates and extends for any truncated recipe. Redundant casts
/// will be folded later.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll
new file mode 100644
index 00000000000000..b3fb78df060820
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/whilewr-opt.ll
@@ -0,0 +1,369 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=loop-vectorize -mtriple=aarch64-linux-gnu -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
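+;
+; Each loop below implements, for a different element width, roughly:
+;   for (int i = 0; i < n; i++)
+;     c[i] = a[i] + b[i];
+; Only %a is marked noalias, so an alias lane mask is built from the
+; difference between the %b and %c pointers.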
+define dso_local void @whilewr_8(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_8(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+; CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %b15, %c14
+ %neg.compare = icmp slt i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+ %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+ %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
+ %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
+ %2 = zext i8 %1 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
+ %4 = getelementptr inbounds i8, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %5 = getelementptr inbounds i8, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
+ %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
+ %7 = getelementptr inbounds i8, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
+ %index.next = add i64 %index, %2
+ %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+ br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_16(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_16(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B15:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
+; CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
+; CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP4]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP6]], ptr [[TMP7]], i32 2, <vscale x 8 x i1> [[TMP3]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %b15, %c14
+ %diff = sdiv i64 %sub.diff, 2
+ %neg.compare = icmp slt i64 %sub.diff, -1
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+ %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+ %0 = zext <vscale x 8 x i1> %active.lane.mask.alias to <vscale x 8 x i8>
+ %1 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %0)
+ %2 = zext i8 %1 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = and <vscale x 8 x i1> %active.lane.mask, %active.lane.mask.alias
+ %4 = getelementptr inbounds i16, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %3, <vscale x 8 x i16> poison)
+ %5 = getelementptr inbounds i16, ptr %b, i64 %index
+ %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %3, <vscale x 8 x i16> poison)
+ %6 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
+ %7 = getelementptr inbounds i16, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %6, ptr %7, i32 2, <vscale x 8 x i1> %3)
+ %index.next = add i64 %index, %2
+ %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %8 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+ br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_32(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_32(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+; CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
+; CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 4 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP4]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c12 = ptrtoint ptr %c to i64
+ %b13 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %b13, %c12
+ %diff = sdiv i64 %sub.diff, 4
+ %neg.compare = icmp slt i64 %sub.diff, -3
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+ %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+ %0 = zext <vscale x 4 x i1> %active.lane.mask.alias to <vscale x 4 x i8>
+ %1 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %0)
+ %2 = zext i8 %1 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = and <vscale x 4 x i1> %active.lane.mask, %active.lane.mask.alias
+ %4 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %3, <vscale x 4 x i32> poison)
+ %5 = getelementptr inbounds i32, ptr %b, i64 %index
+ %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %3, <vscale x 4 x i32> poison)
+ %6 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
+ %7 = getelementptr inbounds i32, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %6, ptr %7, i32 4, <vscale x 4 x i1> %3)
+ %index.next = add i64 %index, %2
+ %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %8 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+ br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define dso_local void @whilewr_64(ptr noalias nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c, i32 noundef %n) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @whilewr_64(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
+; CHECK-NEXT: [[B13:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
+; CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
+; CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 2 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[TMP3]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %c12 = ptrtoint ptr %c to i64
+ %b13 = ptrtoint ptr %b to i64
+ %wide.trip.count = zext nneg i32 %n to i64
+ %sub.diff = sub i64 %b13, %c12
+ %diff = sdiv i64 %sub.diff, 8
+ %neg.compare = icmp slt i64 %sub.diff, -7
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+ %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+ %0 = zext <vscale x 2 x i1> %active.lane.mask.alias to <vscale x 2 x i8>
+ %1 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %0)
+ %2 = zext i8 %1 to i64
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %3 = and <vscale x 2 x i1> %active.lane.mask, %active.lane.mask.alias
+ %4 = getelementptr inbounds i64, ptr %a, i64 %index
+ %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %3, <vscale x 2 x i64> poison)
+ %5 = getelementptr inbounds i64, ptr %b, i64 %index
+ %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %3, <vscale x 2 x i64> poison)
+ %6 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
+ %7 = getelementptr inbounds i64, ptr %c, i64 %index
+ tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %6, ptr %7, i32 8, <vscale x 2 x i1> %3)
+ %index.next = add i64 %index, %2
+ %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+ %8 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+ br i1 %8, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) #1
+
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) #2
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>) #4
+
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
+
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>) #2
+
+declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8>) #4
+
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) #1
+
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #2
+
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8>) #4
+
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1
+
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) #2
+
+declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>) #3
+
+declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8>) #4
+
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index e7e63e55802fe1..53ef470e098d0d 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -185,18 +185,18 @@ define void @load_clamped_index_offset_1(ptr %A, ptr %B, i32 %N) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = urem i32 [[TMP10]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = urem i32 [[TMP8]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index 55bbf54d1f39d7..5760015b63fac8 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -9,11 +9,11 @@ define void @same_step_and_size(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
entry:
br label %loop
@@ -39,11 +39,11 @@ define void @same_step_and_size_no_dominance_between_accesses(ptr %a, ptr %b, i6
; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
entry:
br label %loop
@@ -76,16 +76,16 @@ define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n)
; CHECK-LABEL: @different_steps_and_different_access_sizes(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT: [[N_SHL_1:%.]] = shl i64 %n, 1
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr %a, i64 [[N_SHL_1]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP4]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 1
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
entry:
br label %loop
@@ -112,12 +112,12 @@ define void @steps_match_but_different_access_sizes_1(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[B1]], -2
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
entry:
br label %loop
@@ -146,12 +146,12 @@ define void @steps_match_but_different_access_sizes_2(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A1]], 2
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[B2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
entry:
br label %loop
@@ -177,25 +177,27 @@ exit:
; Test case for PR57315.
define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) {
; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1(
-; CHECK: entry:
-; CHECK-NEXT: [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT: [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT: br label %outer
-
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
; CHECK: outer.header:
-; CHECK: [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT: [[A_GEP_UPPER:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT: [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT: [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
-
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
+
+
entry:
br label %outer.header
@@ -227,25 +229,27 @@ exit:
; sink and source swapped.
define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64 %n) {
; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner2(
-; CHECK: entry:
-; CHECK-NEXT: [[N_SHL_2:%.]] = shl i64 %n, 2
-; CHECK-NEXT: [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]]
-; CHECK-NEXT: br label %outer
-
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
; CHECK: outer.header:
-; CHECK: [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2
-; CHECK-NEXT: [[A_GEP_UPPER:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_SHL_2]]
-; CHECK-NEXT: [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4
-; CHECK-NEXT: [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]]
-; CHECK: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
-
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
+
+
entry:
br label %outer.header
@@ -281,15 +285,15 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
-; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
; CHECK-NEXT: br label [[OUTER_LOOP:%.*]]
; CHECK: outer.loop:
; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N]]
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N:%.*]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
;
entry:
>From f6b3fbb70a8017f75e50c91a082817cf2ba1628b Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 5 Aug 2024 13:43:08 +0100
Subject: [PATCH 2/9] Move PointerDiffInfoValues to VPlan.h
---
llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 14 --------------
llvm/lib/Transforms/Vectorize/VPlan.h | 14 ++++++++++++++
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 5ad5015b460d9d..646612bcb3cfe3 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -448,20 +448,6 @@ struct PointerDiffInfo {
NeedsFreeze(NeedsFreeze) {}
};
-/// A pair of pointers that could overlap across a loop iteration.
-struct PointerDiffInfoValues {
- /// The pointer being read from
- Value *Src;
- /// The pointer being stored to
- Value *Sink;
-
- PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
- SCEVExpander Exp, Instruction *Loc)
- : Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
- Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
- PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
-};
-
/// Holds information about the memory runtime legality checks to verify
/// that a group of pointers do not overlap.
class RuntimePointerChecking {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e850c4ff705a5d..aa7c29a82d28c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3827,6 +3827,20 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
bool isHeaderMask(VPValue *V, VPlan &Plan);
} // end namespace vputils
+/// A pair of pointers that could overlap across a loop iteration.
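+/// These are typically created by expanding the SCEV start values of a
+/// PointerDiffInfo at the loop preheader.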
+struct PointerDiffInfoValues {
+ /// The pointer being read from
+ Value *Src;
+ /// The pointer being stored to
+ Value *Sink;
+
+ PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
+ SCEVExpander Exp, Instruction *Loc)
+ : Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
+ Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
+ PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
+};
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
>From 2a351ed7ee51d58dc9abc398ea57f68800791194 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 6 Aug 2024 13:48:17 +0100
Subject: [PATCH 3/9] Move SCEV expansion so it's done after we know if we're
tail-folding or not
---
.../Vectorize/LoopVectorizationPlanner.h | 2 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 42 +++++++++----------
.../AArch64/induction-costs-sve.ll | 21 +++++++---
.../LoopVectorize/ARM/scalar-block-cost.ll | 4 +-
4 files changed, 40 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 67720c2fa8f693..00dea303d43b9d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -363,7 +363,7 @@ class LoopVectorizationPlanner {
/// loop iteration.
std::optional<VectorizationFactor>
plan(ElementCount UserVF, unsigned UserIC,
- SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask);
+ std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
+ std::function<Value *(const SCEV *)> Expander, bool &HasAliasMask);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ab8b61595d6c2..5d0bcf2fba9659 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6906,8 +6906,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
- SmallVector<PointerDiffInfoValues> RTChecks,
- bool &HasAliasMask) {
+ std::optional<ArrayRef<PointerDiffInfo>> RTChecks,
+ std::function<Value *(const SCEV *)> Expander,
+ bool &HasAliasMask) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -6916,6 +6916,18 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return std::nullopt;
+ // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
+ // here and put them into a list.
+ SmallVector<PointerDiffInfoValues> DiffChecksValues;
+ if (RTChecks.has_value() &&
+ useActiveLaneMask(CM.getTailFoldingStyle(true))) {
+ for (auto Check : *RTChecks) {
+ Value *Sink = Expander(Check.SinkStart);
+ Value *Src = Expander(Check.SrcStart);
+ DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
+ }
+ }
+
// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
!useMaskedInterleavedAccesses(TTI)) {
@@ -6944,7 +6956,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- buildVPlansWithVPRecipes(UserVF, UserVF, RTChecks, HasAliasMask);
+ buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecksValues, HasAliasMask);
if (!hasPlanWithVF(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
<< ".\n");
@@ -6979,9 +6991,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
}
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
- RTChecks, HasAliasMask);
+ DiffChecksValues, HasAliasMask);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
- RTChecks, HasAliasMask);
+ DiffChecksValues, HasAliasMask);
LLVM_DEBUG(printPlans(dbgs()));
if (VPlans.empty())
@@ -9907,24 +9919,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getDataLayout(),
AddBranchWeights);
- // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
- // here and put them into a list.
- std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
- LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks();
- SmallVector<PointerDiffInfoValues> DiffChecksValues;
- if (DiffChecks.has_value() &&
- useActiveLaneMask(CM.getTailFoldingStyle(true))) {
- Instruction *Loc = L->getLoopPreheader()->getTerminator();
- for (auto Check : *DiffChecks) {
- Value *Sink = Checks.expandCodeForMemCheck(Check.SinkStart, Loc);
- Value *Src = Checks.expandCodeForMemCheck(Check.SrcStart, Loc);
- DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
- }
- }
-
// Plan how to best vectorize, return the best VF and its cost.
+ auto Expand = [&Checks, &L](const SCEV *S) {
+ return Checks.expandCodeForMemCheck(
+ S, L->getLoopPreheader()->getTerminator());
+ };
std::optional<VectorizationFactor> MaybeVF =
- LVP.plan(UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask);
+ LVP.plan(UserVF, UserIC,
+ LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand,
+ Checks.HasAliasMask);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index edba5ee1d7f9eb..b8053bb31b58e3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -153,7 +153,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT: br label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
@@ -163,6 +163,13 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
+; PRED-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; PRED-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[DIFF]], 0
+; PRED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; PRED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -177,9 +184,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP30:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP30]], <vscale x 8 x i8> poison)
; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
; PRED-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
@@ -188,8 +196,11 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP30]])
+; PRED-NEXT: [[TMP31:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; PRED-NEXT: [[TMP32:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP31]])
+; PRED-NEXT: [[TMP33:%.*]] = zext i8 [[TMP32]] to i64
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP33]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
@@ -197,7 +208,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; PRED-NEXT: br label [[LOOP:%.*]]
; PRED: loop:
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
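The new vector.ph checks above give the whole shape of the alias-mask scheme: the pointer difference (src minus dst) becomes a per-lane predicate, the masked load and store in the vector body are predicated on the AND of that predicate with the loop's active lane mask ([[TMP30]]), and the induction variable advances by the number of enabled alias lanes (the zext plus vector.reduce.add) instead of by the full VF. A minimal scalar sketch of the per-lane logic, not part of the patch and using illustrative names only, assuming i8 elements as in this test (wider element types first divide the byte difference by the element size, as the alias_mask_* checks later in the series show):

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch of the SUB_DIFF / NEG_COMPARE / PTR_DIFF_LANE_MASK sequence above.
std::vector<bool> aliasLaneMask(uint64_t SrcAddr, uint64_t DstAddr,
                                unsigned VL) {
  int64_t Diff = (int64_t)(SrcAddr - DstAddr); // sub i64 SRC, DST
  std::vector<bool> Mask(VL);
  for (unsigned Lane = 0; Lane < VL; ++Lane)
    // A lane stays enabled if the difference is negative (the NEG_COMPARE
    // splat) or the lane index is below it (get.active.lane.mask(0, DIFF)).
    Mask[Lane] = Diff < 0 || (int64_t)Lane < Diff;
  return Mask;
}

// The induction then steps by the number of enabled lanes, matching the
// zext + vector.reduce.add in the checks, rather than by VF.
unsigned aliasMaskStep(const std::vector<bool> &Mask) {
  return (unsigned)std::count(Mask.begin(), Mask.end(), true);
}
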
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 596e42e9f094de..b381392ebf9f16 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -9,8 +9,8 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %i.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add = add nuw nsw i32 %i.09, 1
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %data, i32 %add
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i32, ptr %arrayidx, align 4
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add1 = add nsw i32 %0, 5
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add1 = add nsw i32 %1, 5
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %dst, i32 %i.09
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add1, ptr %arrayidx2, align 4
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i32 %add, %n
>From b13aac9e6082f34d61b9f74fb8975aa0de34e3fb Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 10:29:17 +0100
Subject: [PATCH 4/9] Add statistic variable
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5d0bcf2fba9659..a48efc3dea6b81 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -178,6 +178,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -9925,6 +9926,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
};
std::optional<VectorizationFactor> MaybeVF =
LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand, Checks.HasAliasMask);
+ if (Checks.HasAliasMask)
+ LoopsAliasMasked++;
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
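For context, the counter added here follows the usual STATISTIC pattern, so it can be observed when running the vectorizer with statistics enabled (an assertions-enabled build is assumed), for example with: opt -passes=loop-vectorize -S -stats input.ll. The exact output line depends on the pass's DEBUG_TYPE, but it should report the number of loops predicated with an alias mask alongside LoopsVectorized and the other counters above.
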
>From 00a492d22fd8365fe5ed1e4590203ca4b8306656 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 16:33:17 +0100
Subject: [PATCH 5/9] Remove SCEV expander header include move
---
llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 2 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 646612bcb3cfe3..afafb74bdcb0ac 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -16,8 +16,8 @@
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <optional>
#include <variant>
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index aa7c29a82d28c2..49ac2a42fbca8f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -43,6 +43,7 @@
#include "llvm/IR/FMF.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/InstructionCost.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
>From 93ef1564da690fb35428f6d6f1336c85eb4a4b3c Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 16:39:58 +0100
Subject: [PATCH 6/9] Remove unnecessary changes from
runtime-checks-difference.ll
---
.../runtime-checks-difference.ll | 52 +++++++++----------
1 file changed, 25 insertions(+), 27 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
index 5760015b63fac8..61767c96c97cca 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll
@@ -9,11 +9,11 @@ define void @same_step_and_size(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
;
entry:
br label %loop
@@ -39,11 +39,11 @@ define void @same_step_and_size_no_dominance_between_accesses(ptr %a, ptr %b, i6
; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
;
entry:
br label %loop
@@ -76,16 +76,16 @@ define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n)
; CHECK-LABEL: @different_steps_and_different_access_sizes(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 1
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 %n, 1
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr %a, i64 [[TMP1]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
;
entry:
br label %loop
@@ -112,12 +112,12 @@ define void @steps_match_but_different_access_sizes_1(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[B1]], -2
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
;
entry:
br label %loop
@@ -146,12 +146,12 @@ define void @steps_match_but_different_access_sizes_2(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A1]], 2
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[B2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph
;
entry:
br label %loop
@@ -177,10 +177,11 @@ exit:
; Test case for PR57315.
define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) {
; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N:%.*]], 2
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 %n, 2
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr %b, i64 [[TMP0]]
+; CHECK-NEXT: br label %outer
+
; CHECK: outer.header:
; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2
@@ -194,10 +195,8 @@ define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
;
-
-
entry:
br label %outer.header
@@ -241,15 +240,14 @@ define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64
; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck
+
; CHECK: vector.memcheck:
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph
;
-
-
entry:
br label %outer.header
@@ -289,7 +287,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
; CHECK-NEXT: br label [[OUTER_LOOP:%.*]]
; CHECK: outer.loop:
; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
>From a1b0669d6a393c13cdca78c00ff4386617b93e59 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 7 Aug 2024 16:40:58 +0100
Subject: [PATCH 7/9] Remove clang test
---
clang/test/CodeGen/loop-alias-mask.c | 404 ---------------------------
1 file changed, 404 deletions(-)
delete mode 100644 clang/test/CodeGen/loop-alias-mask.c
diff --git a/clang/test/CodeGen/loop-alias-mask.c b/clang/test/CodeGen/loop-alias-mask.c
deleted file mode 100644
index 76c3b5deddfa09..00000000000000
--- a/clang/test/CodeGen/loop-alias-mask.c
+++ /dev/null
@@ -1,404 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang --target=aarch64-linux-gnu -march=armv9+sme2 -emit-llvm -S -g0 -O3 -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize %s -o - | FileCheck %s
-#include <stdint.h>
-
-// CHECK-LABEL: define dso_local void @alias_mask_8(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[B15:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP4]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6:![0-9]+]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP3]]), !tbaa [[TBAA6]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP9:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_8(uint8_t *restrict a, uint8_t * b, uint8_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_16(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[B15:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B15]], [[C14]]
-// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP4]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13:![0-9]+]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
-// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD16]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP6]], ptr [[TMP7]], i32 2, <vscale x 8 x i1> [[TMP3]]), !tbaa [[TBAA13]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP15:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_16(uint16_t *restrict a, uint16_t * b, uint16_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_32(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[B13:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
-// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 4 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP4]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16:![0-9]+]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
-// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP3]]), !tbaa [[TBAA16]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP18:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_32(uint32_t *restrict a, uint32_t * b, uint32_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_64(
-// CHECK-SAME: ptr noalias nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[B13:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[B13]], [[C12]]
-// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 2 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP3:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19:![0-9]+]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
-// CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[TMP3]]), !tbaa [[TBAA19]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP21:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_64(uint64_t *restrict a, uint64_t * b, uint64_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_8(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[A15:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT: [[B16:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], 0
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
-// CHECK-NEXT: [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], 0
-// CHECK-NEXT: [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
-// CHECK-NEXT: [[DOTSPLAT22:%.*]] = shufflevector <vscale x 16 x i1> [[DOTSPLATINSERT21]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[SUB_DIFF18]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 16 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
-// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 16 x i1> [[TMP0]] to <vscale x 16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i8> poison), !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP7]], ptr [[TMP8]], i32 1, <vscale x 16 x i1> [[TMP4]]), !tbaa [[TBAA6]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP22:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_multiple_8(uint8_t * a, uint8_t * b, uint8_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_16(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C14:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[A15:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT: [[B16:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A15]], [[C14]]
-// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 2
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -1
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[SUB_DIFF18:%.*]] = sub i64 [[B16]], [[C14]]
-// CHECK-NEXT: [[DIFF19:%.*]] = sdiv i64 [[SUB_DIFF18]], 2
-// CHECK-NEXT: [[NEG_COMPARE20:%.*]] = icmp slt i64 [[SUB_DIFF18]], -1
-// CHECK-NEXT: [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE20]], i64 0
-// CHECK-NEXT: [[DOTSPLAT22:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT21]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK23:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF19]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS24:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK23]], [[DOTSPLAT22]]
-// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS24]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 8 x i1> [[TMP0]] to <vscale x 8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP5]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
-// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP6]], i32 2, <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> poison), !tbaa [[TBAA13]]
-// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 8 x i16> [[WIDE_MASKED_LOAD25]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[TMP7]], ptr [[TMP8]], i32 2, <vscale x 8 x i1> [[TMP4]]), !tbaa [[TBAA13]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP23:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_multiple_16(uint16_t * a, uint16_t * b, uint16_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_32(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[A13:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT: [[B14:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
-// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 4
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -3
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
-// CHECK-NEXT: [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 4
-// CHECK-NEXT: [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -3
-// CHECK-NEXT: [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
-// CHECK-NEXT: [[DOTSPLAT20:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT19]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[DIFF17]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 4 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
-// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 4 x i1> [[TMP0]] to <vscale x 4 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
-// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> poison), !tbaa [[TBAA16]]
-// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP4]]), !tbaa [[TBAA16]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP24:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_multiple_32(uint32_t * a, uint32_t * b, uint32_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
-
-// CHECK-LABEL: define dso_local void @alias_mask_multiple_64(
-// CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], ptr nocapture noundef writeonly [[C:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
-// CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK: for.body.preheader:
-// CHECK-NEXT: [[C12:%.*]] = ptrtoint ptr [[C]] to i64
-// CHECK-NEXT: [[A13:%.*]] = ptrtoint ptr [[A]] to i64
-// CHECK-NEXT: [[B14:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
-// CHECK-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[A13]], [[C12]]
-// CHECK-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 8
-// CHECK-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[SUB_DIFF]], -7
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE]], i64 0
-// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
-// CHECK-NEXT: [[SUB_DIFF16:%.*]] = sub i64 [[B14]], [[C12]]
-// CHECK-NEXT: [[DIFF17:%.*]] = sdiv i64 [[SUB_DIFF16]], 8
-// CHECK-NEXT: [[NEG_COMPARE18:%.*]] = icmp slt i64 [[SUB_DIFF16]], -7
-// CHECK-NEXT: [[DOTSPLATINSERT19:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[NEG_COMPARE18]], i64 0
-// CHECK-NEXT: [[DOTSPLAT20:%.*]] = shufflevector <vscale x 2 x i1> [[DOTSPLATINSERT19]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-// CHECK-NEXT: [[PTR_DIFF_LANE_MASK21:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[DIFF17]])
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ALIAS22:%.*]] = or <vscale x 2 x i1> [[PTR_DIFF_LANE_MASK21]], [[DOTSPLAT20]]
-// CHECK-NEXT: [[TMP0:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK_ALIAS]], [[ACTIVE_LANE_MASK_ALIAS22]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 2 x i1> [[TMP0]] to <vscale x 2 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-// CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-// CHECK: vector.body:
-// CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-// CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], [[TMP0]]
-// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
-// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-// CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> poison), !tbaa [[TBAA19]]
-// CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD]]
-// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[INDEX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[TMP4]]), !tbaa [[TBAA19]]
-// CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]]
-// CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-// CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-// CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP25:![0-9]+]]
-// CHECK: for.cond.cleanup:
-// CHECK-NEXT: ret void
-//
-void alias_mask_multiple_64(uint64_t * a, uint64_t * b, uint64_t * c, int n) {
- #pragma clang loop vectorize(enable)
- for (int i = 0; i < n; i++) {
- c[i] = a[i] + b[i];
- }
-}
>From 7a87bf41169c2ae2a596227fe9679ce78141a933 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 8 Aug 2024 16:17:50 +0100
Subject: [PATCH 8/9] Use VPExpandSCEVRecipe
---
.../Vectorize/LoopVectorizationPlanner.h | 11 ++---
.../Transforms/Vectorize/LoopVectorize.cpp | 49 +++++++------------
.../AArch64/induction-costs-sve.ll | 4 +-
.../LoopVectorize/ARM/scalar-block-cost.ll | 4 +-
4 files changed, 29 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 00dea303d43b9d..037910a8d1a2f1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -363,7 +363,7 @@ class LoopVectorizationPlanner {
/// loop iteration.
std::optional<VectorizationFactor>
plan(ElementCount UserVF, unsigned UserIC,
- std::optional<ArrayRef<PointerDiffInfo>> DiffChecks, std::function<Value*(const SCEV *)> Expander, bool &HasAliasMask);
+ std::optional<ArrayRef<PointerDiffInfo>> DiffChecks, bool &HasAliasMask);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
@@ -440,10 +440,9 @@ class LoopVectorizationPlanner {
/// setting HasAliasMask to true in the case that an alias mask is generated
/// and the vector loop should be entered even if the pointers alias across a
/// loop iteration.
- VPlanPtr
- tryToBuildVPlanWithVPRecipes(VFRange &Range,
- SmallVector<PointerDiffInfoValues> RTChecks,
- bool &HasAliasMask);
+ VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range,
+ ArrayRef<PointerDiffInfo> RTChecks,
+ bool &HasAliasMask);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
@@ -451,7 +450,7 @@ class LoopVectorizationPlanner {
/// RTChecks contains a list of pointer pairs that an alias mask should be
/// generated for.
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
- SmallVector<PointerDiffInfoValues> RTChecks,
+ ArrayRef<PointerDiffInfo> RTChecks,
bool &HasAliasMask);
// Adjust the recipes for reductions. For in-loop reductions the chain of
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a48efc3dea6b81..8ae45ad1a8316e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1936,10 +1936,6 @@ class GeneratedRTChecks {
OuterLoop = L->getParentLoop();
}
- Value *expandCodeForMemCheck(const SCEV *Scev, Instruction *Loc) {
- return MemCheckExp.expandCodeFor(Scev, Scev->getType(), Loc);
- }
-
InstructionCost getCost() {
if (SCEVCheckBlock || MemCheckBlock)
LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
@@ -6905,10 +6901,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
-std::optional<VectorizationFactor>
-LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
- std::optional<ArrayRef<PointerDiffInfo>> RTChecks,
- std::function<Value*(const SCEV*)> Expander, bool &HasAliasMask) {
+std::optional<VectorizationFactor> LoopVectorizationPlanner::plan(
+ ElementCount UserVF, unsigned UserIC,
+ std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -6919,15 +6914,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
// VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
// here and put them into a list.
- SmallVector<PointerDiffInfoValues> DiffChecksValues;
- if (RTChecks.has_value()
- && useActiveLaneMask(CM.getTailFoldingStyle(true))) {
- for (auto Check : *RTChecks) {
- Value *Sink = Expander(Check.SinkStart);
- Value *Src = Expander(Check.SrcStart);
- DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
- }
- }
+ ArrayRef<PointerDiffInfo> DiffChecks;
+ if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
+ DiffChecks = *RTChecks;
// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
@@ -6957,7 +6946,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecksValues, HasAliasMask);
+ buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
if (!hasPlanWithVF(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
<< ".\n");
@@ -6992,9 +6981,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
}
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
- DiffChecksValues, HasAliasMask);
+ DiffChecks, HasAliasMask);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
- DiffChecksValues, HasAliasMask);
+ DiffChecks, HasAliasMask);
LLVM_DEBUG(printPlans(dbgs()));
if (VPlans.empty())
@@ -8387,8 +8376,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
- ElementCount MinVF, ElementCount MaxVF,
- SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask) {
+ ElementCount MinVF, ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
+ bool &HasAliasMask) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
auto MaxVFTimes2 = MaxVF * 2;
@@ -8534,8 +8523,7 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
}
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
- VFRange &Range, SmallVector<PointerDiffInfoValues> RTChecks,
- bool &HasAliasMask) {
+ VFRange &Range, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8584,8 +8572,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPBuilder Builder(VecPreheader);
for (auto C : RTChecks) {
HasAliasMask = true;
- VPValue *Sink = Plan->getOrAddLiveIn(C.Sink);
- VPValue *Src = Plan->getOrAddLiveIn(C.Src);
+ VPValue *Sink = vputils::getOrCreateVPValueForSCEVExpr(*Plan, C.SinkStart,
+ *PSE.getSE());
+ VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(*Plan, C.SrcStart,
+ *PSE.getSE());
VPValue *M =
Builder.createNaryOp(VPInstruction::AliasLaneMask, {Sink, Src}, DL,
"active.lane.mask.alias");
@@ -9921,11 +9911,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
AddBranchWeights);
// Plan how to best vectorize, return the best VF and its cost.
- auto Expand = [&Checks, &L](const SCEV *S) {
- return Checks.expandCodeForMemCheck(S, L->getLoopPreheader()->getTerminator());
- };
std::optional<VectorizationFactor> MaybeVF =
- LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand, Checks.HasAliasMask);
+ LVP.plan(UserVF, UserIC,
+ LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
+ Checks.HasAliasMask);
if (Checks.HasAliasMask)
LoopsAliasMasked++;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index b8053bb31b58e3..a2c55461318984 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -146,12 +146,14 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: entry:
; PRED-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; PRED-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT: [[SRC3:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT: [[DST2:%.*]] = ptrtoint ptr [[DST]] to i64
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; PRED: vector.memcheck:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
-; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST2]], [[SRC3]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; PRED-NEXT: br label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index b381392ebf9f16..596e42e9f094de 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -9,8 +9,8 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %i.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add = add nuw nsw i32 %i.09, 1
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %data, i32 %add
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i32, ptr %arrayidx, align 4
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add1 = add nsw i32 %1, 5
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add1 = add nsw i32 %0, 5
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %dst, i32 %i.09
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add1, ptr %arrayidx2, align 4
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i32 %add, %n
>From 5075605d579f167740c632a62e9b37b66381f962 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 14 Aug 2024 09:39:39 +0100
Subject: [PATCH 9/9] Make alias lane mask a recipe
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 40 ++++++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 111 +++++++++---------
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
4 files changed, 96 insertions(+), 61 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8ae45ad1a8316e..b951b1b3f998dd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8576,9 +8576,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*PSE.getSE());
VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(*Plan, C.SrcStart,
*PSE.getSE());
- VPValue *M =
- Builder.createNaryOp(VPInstruction::AliasLaneMask, {Sink, Src}, DL,
- "active.lane.mask.alias");
+ VPAliasLaneMaskRecipe *M = new VPAliasLaneMaskRecipe(Src, Sink);
+ VecPreheader->appendRecipe(M);
if (AliasMask)
AliasMask = Builder.createAnd(AliasMask, M);
else
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 49ac2a42fbca8f..30213f6ff31648 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -908,6 +908,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
switch (R->getVPDefID()) {
case VPRecipeBase::VPDerivedIVSC:
case VPRecipeBase::VPEVLBasedIVPHISC:
+ case VPRecipeBase::VPAliasLaneMaskSC:
case VPRecipeBase::VPExpandSCEVSC:
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
@@ -1240,7 +1241,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
SLPLoad,
SLPStore,
ActiveLaneMask,
- AliasLaneMask,
ExplicitVectorLength,
/// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
/// The first operand is the incoming value from the predecessor in VPlan,
@@ -2677,6 +2677,44 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
}
};
+// Given a pointer A that is being stored to, and pointer B that is being
+// read from, both with unknown lengths, create a mask that disables
+// elements which could overlap across a loop iteration. For example, if A is
+// X and B is X + 2 with VF being 4, only the final two elements of the loaded
+// vector can be stored since they don't overlap with the stored vector:
+//   %b.vec = load %b  ; = [s, t, u, v]
+//   [...]
+//   store %b.vec, %a  ; only u and v can be stored as their addresses don't
+//                     ; overlap with %a + (VF - 1)
+class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
+
+public:
+ VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink)
+ : VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}) {}
+
+ ~VPAliasLaneMaskRecipe() override = default;
+
+ VPAliasLaneMaskRecipe *clone() override {
+ return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue());
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
+
+ void execute(VPTransformState &State) override;
+
+ /// Get the VPValue* for the pointer being read from
+ VPValue *getSourceValue() const { return getOperand(0); }
+
+ /// Get the VPValue* for the pointer being stored to
+ VPValue *getSinkValue() const { return getOperand(1); }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// Recipe to expand a SCEV expression.
class VPExpandSCEVRecipe : public VPSingleDefRecipe {
const SCEV *Expr;
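A rough, non-normative sketch of the per-lane predicate the new recipe models, written as standalone C++ rather than VPlan IR. The function and variable names (aliasLaneMask, srcAddr, sinkAddr, eltSizeBytes) are illustrative only and not part of the patch; the subtraction direction follows the lowering further down, i.e. source (load) pointer minus sink (store) pointer.

#include <cstdint>
#include <vector>

// Illustrative model only: which lanes the alias mask leaves enabled for one
// vector iteration, given the two scalar start addresses and the element size.
std::vector<bool> aliasLaneMask(int64_t srcAddr, int64_t sinkAddr,
                                int64_t eltSizeBytes, unsigned VF) {
  // Signed distance, in elements, from the stored-to pointer to the
  // read-from pointer ("diff" in the emitted IR).
  int64_t diff = (srcAddr - sinkAddr) / eltSizeBytes;
  // A negative difference re-enables every lane ("neg.compare", splatted).
  bool allEnabled = diff < 0;
  std::vector<bool> mask(VF);
  for (unsigned lane = 0; lane < VF; ++lane)
    // get.active.lane.mask(0, diff) enables lanes with index < diff.
    mask[lane] = allEnabled || static_cast<int64_t>(lane) < diff;
  return mask;
}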
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c48fc71d156cfc..78bb6113186e9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -431,60 +431,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
- case VPInstruction::AliasLaneMask: {
- // Given a pointer A that is being stored to, and pointer B that is being
- // read from, both with unknown lengths, create a mask that disables
- // elements which could overlap across a loop iteration. For example, if A
- // is X and B is X + 2 with VF being 4, only the final two elements of the
- // loaded vector can be stored since they don't overlap with the stored
- // vector. %b.vec = load %b ; = [s, t, u, v]
- // [...]
- // store %a, %b.vec ; only u and v can be stored as their addresses don't
- // overlap with %a + (VF - 1)
- Value *ReadPtr = State.get(getOperand(0), VPIteration(Part, 0));
- Value *StorePtr = State.get(getOperand(1), VPIteration(Part, 0));
- unsigned ElementSize = 0;
-
- // We expect the operands to the alias mask to be ptrtoint. Sometimes it's
- // an add of a ptrtoint.
- auto *ReadInsn = cast<Instruction>(ReadPtr);
- auto *ReadCast = dyn_cast<CastInst>(ReadPtr);
- if (ReadInsn->getOpcode() == Instruction::Add)
- ReadCast = dyn_cast<CastInst>(ReadInsn->getOperand(0));
-
- if (ReadCast && ReadCast->getOpcode() == Instruction::PtrToInt) {
- Value *Ptr = ReadCast->getOperand(0);
- for (auto *Use : Ptr->users()) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Use)) {
- auto *EltVT = GEP->getSourceElementType();
- if (EltVT->isArrayTy())
- ElementSize = EltVT->getArrayElementType()->getScalarSizeInBits() *
- EltVT->getArrayNumElements();
- else
- ElementSize =
- GEP->getSourceElementType()->getScalarSizeInBits() / 8;
- break;
- }
- }
- }
- assert(ElementSize > 0 && "Couldn't get element size from pointer");
- // Calculate how many elements the pointers differ by
- Value *Diff = Builder.CreateSub(StorePtr, ReadPtr, "sub.diff");
- auto *Type = Diff->getType();
- Value *MemEltSize = ConstantInt::get(Type, ElementSize);
- Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
- // If the difference is negative then some elements may alias
- Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLT, DiffDiv,
- ConstantInt::get(Type, 0), "neg.compare");
- // Splat the compare result then OR it with a lane mask
- Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
- Value *DiffMask = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask,
- {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
- {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
- return Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat,
- Name);
- }
// Count the number of bits set in each lane and reduce the result to a scalar
case VPInstruction::PopCount: {
if (Part != 0)
@@ -901,9 +847,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumePhi:
O << "resume-phi";
break;
- case VPInstruction::AliasLaneMask:
- O << "alias lane mask";
- break;
case VPInstruction::PopCount:
O << "popcount";
break;
@@ -2549,6 +2492,60 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPAliasLaneMaskRecipe::execute(VPTransformState &State) {
+ IRBuilderBase Builder = State.Builder;
+ Value *SinkValue = State.get(getSinkValue(), 0, true);
+ Value *SourceValue = State.get(getSourceValue(), 0, true);
+
+ unsigned ElementSize = 0;
+ auto *ReadInsn = cast<Instruction>(SourceValue);
+ auto *ReadCast = dyn_cast<CastInst>(SourceValue);
+ if (ReadInsn->getOpcode() == Instruction::Add)
+ ReadCast = dyn_cast<CastInst>(ReadInsn->getOperand(0));
+
+ if (ReadCast && ReadCast->getOpcode() == Instruction::PtrToInt) {
+ Value *Ptr = ReadCast->getOperand(0);
+ for (auto *Use : Ptr->users()) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Use)) {
+ auto *EltVT = GEP->getSourceElementType();
+ if (EltVT->isArrayTy())
+ ElementSize = EltVT->getArrayElementType()->getScalarSizeInBits() *
+ EltVT->getArrayNumElements();
+ else
+ ElementSize = GEP->getSourceElementType()->getScalarSizeInBits() / 8;
+ break;
+ }
+ }
+ }
+ assert(ElementSize > 0 && "Couldn't get element size from pointer");
+
+ Value *Diff = Builder.CreateSub(SourceValue, SinkValue, "sub.diff");
+ auto *Type = Diff->getType();
+ Value *MemEltSize = ConstantInt::get(Type, ElementSize);
+ Value *DiffDiv = Builder.CreateSDiv(Diff, MemEltSize, "diff");
+ // If the difference is negative then some elements may alias
+ Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_SLT, DiffDiv,
+ ConstantInt::get(Type, 0), "neg.compare");
+ // Splat the compare result then OR it with a lane mask
+ Value *Splat = Builder.CreateVectorSplat(State.VF, Cmp);
+ Value *DiffMask = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask,
+ {VectorType::get(Builder.getInt1Ty(), State.VF), Type},
+ {ConstantInt::get(Type, 0), DiffDiv}, nullptr, "ptr.diff.lane.mask");
+ Value *Or = Builder.CreateBinOp(Instruction::BinaryOps::Or, DiffMask, Splat);
+ State.set(this, Or, 0, /*IsScalar=*/false);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPAliasLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "ALIAS-LANE-MASK ";
+ getSourceValue()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getSinkValue()->printAsOperand(O, SlotTracker);
+}
+#endif
+
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "cannot be used in per-lane");
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
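To make the arithmetic in VPAliasLaneMaskRecipe::execute() above concrete, here is a small worked example. The addresses and element size are assumptions chosen for illustration, not values taken from the patch or its tests.

#include <cassert>
#include <cstdint>

// Worked example of the scalar values execute() feeds into
// get.active.lane.mask, using an assumed source/sink pair and element size.
int main() {
  int64_t sourceAddr = 0x1008; // pointer being read from
  int64_t sinkAddr = 0x1000;   // pointer being stored to
  int64_t eltSize = 4;         // bytes, as recovered from the GEP element type

  int64_t diff = sourceAddr - sinkAddr; // "sub.diff" in the emitted IR
  int64_t diffDiv = diff / eltSize;     // "diff": signed element distance
  bool negCompare = diffDiv < 0;        // "neg.compare": splatted, then OR'd

  assert(diffDiv == 2 && !negCompare);
  // get.active.lane.mask(0, 2) enables lanes 0 and 1, and OR-ing with the
  // all-false splat keeps the final mask as {1, 1, 0, 0} for VF = 4.
  return 0;
}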
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 452c977106a773..32b21ddfeb7710 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -338,6 +338,7 @@ class VPDef {
using VPRecipeTy = enum {
VPBranchOnMaskSC,
VPDerivedIVSC,
+ VPAliasLaneMaskSC,
VPExpandSCEVSC,
VPInstructionSC,
VPInterleaveSC,