[llvm] [MemoryLocation] Size Scalable Masked MemOps (PR #154785)
Matthew Devereau via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 1 08:19:48 PDT 2025
https://github.com/MDevereau updated https://github.com/llvm/llvm-project/pull/154785
>From 76a56aeb5b5faca73f32fec0db04e6d2ce98bde7 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Fri, 8 Aug 2025 13:06:20 +0000
Subject: [PATCH 1/3] [MemoryLocation] Size Scalable Masked MemOps
Scalable masked loads and stores with a get.active.lane.mask whose active lane
count is less than or equal to the scalable type's minimum number of elements
can be proven to have a fixed size. Adding this information allows scalable
masked loads and stores to benefit from alias analysis optimizations.
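For readers skimming the thread, here is a minimal standalone sketch of the
sizing rule the patch relies on. The helper name and signature below are
illustrative only; the real logic lives in the new helper added to
llvm/lib/Analysis/MemoryLocation.cpp in the diff.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include <cstdint>
#include <optional>

using namespace llvm;

// Illustrative only: a scalable masked load/store governed by
// llvm.get.active.lane.mask(Lo, Hi) with constant operands enables exactly
// Hi - Lo lanes. If that count is no larger than the type's minimum element
// count (vscale >= 1 guarantees at least that many lanes exist), the access
// touches exactly (Hi - Lo) elements, i.e. a fixed number of bytes.
static std::optional<TypeSize> knownAccessSize(const DataLayout &DL,
                                               ScalableVectorType *Ty,
                                               uint64_t LaneMaskLo,
                                               uint64_t LaneMaskHi) {
  if (LaneMaskHi <= LaneMaskLo)
    return std::nullopt; // Empty or inverted mask: nothing provable.
  uint64_t NumElts = LaneMaskHi - LaneMaskLo;
  if (NumElts > Ty->getMinNumElements())
    return std::nullopt; // Active lane count would depend on vscale.
  return DL.getTypeStoreSize(
      FixedVectorType::get(Ty->getElementType(), NumElts));
}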
---
llvm/lib/Analysis/MemoryLocation.cpp | 55 +++++--
llvm/test/Analysis/BasicAA/scalable-dse-aa.ll | 145 ++++++++++++++++++
2 files changed, 189 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 72b643c56a994..f2c3b843f70f6 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -150,6 +150,29 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
}
+static std::optional<FixedVectorType *>
+getFixedTypeFromScalableMemOp(Value *Mask, Type *Ty) {
+ auto ActiveLaneMask = dyn_cast<IntrinsicInst>(Mask);
+ if (!ActiveLaneMask ||
+ ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
+ return std::nullopt;
+
+ auto ScalableTy = dyn_cast<ScalableVectorType>(Ty);
+ if (!ScalableTy)
+ return std::nullopt;
+
+ auto LaneMaskLo = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(0));
+ auto LaneMaskHi = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(1));
+ if (!LaneMaskLo || !LaneMaskHi)
+ return std::nullopt;
+
+ uint64_t NumElts = LaneMaskHi->getZExtValue() - LaneMaskLo->getZExtValue();
+ if (NumElts > ScalableTy->getMinNumElements())
+ return std::nullopt;
+
+ return FixedVectorType::get(ScalableTy->getElementType(), NumElts);
+}
+
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
unsigned ArgIdx,
const TargetLibraryInfo *TLI) {
@@ -213,20 +236,30 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
cast<ConstantInt>(II->getArgOperand(0))->getZExtValue()),
AATags);
- case Intrinsic::masked_load:
+ case Intrinsic::masked_load: {
assert(ArgIdx == 0 && "Invalid argument index");
- return MemoryLocation(
- Arg,
- LocationSize::upperBound(DL.getTypeStoreSize(II->getType())),
- AATags);
- case Intrinsic::masked_store:
+ Type *Ty = II->getType();
+ auto KnownScalableSize =
+ getFixedTypeFromScalableMemOp(II->getOperand(2), Ty);
+ if (KnownScalableSize)
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
+ AATags);
+
+ return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ }
+ case Intrinsic::masked_store: {
assert(ArgIdx == 1 && "Invalid argument index");
- return MemoryLocation(
- Arg,
- LocationSize::upperBound(
- DL.getTypeStoreSize(II->getArgOperand(0)->getType())),
- AATags);
+
+ Type *Ty = II->getArgOperand(0)->getType();
+ auto KnownScalableSize =
+ getFixedTypeFromScalableMemOp(II->getOperand(3), Ty);
+ if (KnownScalableSize)
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
+ AATags);
+
+ return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ }
case Intrinsic::invariant_end:
// The first argument to an invariant.end is a "descriptor" type (e.g. a
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
new file mode 100644
index 0000000000000..c12d1c2f25835
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
+
+define <vscale x 4 x float> @dead_scalable_store(i32 %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.1.16 = getelementptr inbounds nuw i8, ptr %1, i64 16
+ %gep.1.32 = getelementptr inbounds nuw i8, ptr %1, i64 32
+ %gep.1.48 = getelementptr inbounds nuw i8, ptr %1, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.1.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.1.32 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.1.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.46 = getelementptr inbounds nuw i8, ptr %0, i64 46
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
+
+ %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.46 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+
+ %smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.32(i32 0, i32 2)
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %smallmask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
+; CHECK-NOT: store i32 20, ptr %gep.1.12
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 20, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; We don't know if the scalar store is dead as we can't determine vscale.
+; This get active lane mask may cover 4 or 8 integers
+define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 20, ptr %gep.1.28
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
+ store i32 20, ptr %gep.1.28
+
+ %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
+; CHECK-NOT: store i8 60, ptr %gep.1.6
+; CHECK: store i8 120, ptr %gep.1.8
+;
+ %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i32(i32 0, i32 7)
+ %gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
+ store i8 60, ptr %gep.1.6
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i8 120, ptr %gep.1.8
+
+ %load.0 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
+ %retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ ret <vscale x 16 x i8> %retval
+}
>From fc9274f5a9adf77471a2f627e6b9488af3ad3648 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Wed, 27 Aug 2025 09:27:28 +0000
Subject: [PATCH 2/3] Address review comments
Use patternmatch logic
Add pointer tokens to auto declarations
Add offset test dead_scalar_store_offset
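For reference, a rough standalone sketch of the PatternMatch-based mask
recognition this revision switches to. The helper name here is made up; only
the match() expression mirrors what the diff below adds.

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include <cstdint>
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

// Illustrative only: returns the number of active lanes if Mask is
// llvm.get.active.lane.mask(Lo, Hi) with constant operands and Hi > Lo.
// getZExtValue() assumes the operands fit in 64 bits; the final revision of
// the patch switches to APInt for exactly that reason.
static std::optional<uint64_t> constantActiveLanes(Value *Mask) {
  ConstantInt *Lo, *Hi;
  if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>(
                       m_ConstantInt(Lo), m_ConstantInt(Hi))))
    return std::nullopt;
  if (Hi->getZExtValue() <= Lo->getZExtValue())
    return std::nullopt; // Empty or inverted range: nothing provable.
  return Hi->getZExtValue() - Lo->getZExtValue();
}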
---
llvm/lib/Analysis/MemoryLocation.cpp | 46 +++++++---------
llvm/test/Analysis/BasicAA/scalable-dse-aa.ll | 53 ++++++++++++++++++-
2 files changed, 72 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index f2c3b843f70f6..b2a36821829ca 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -12,6 +12,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include <optional>
using namespace llvm;
@@ -150,27 +151,26 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
}
+// If the mask of a masked memory op is a get.active.lane.mask intrinsic with
+// constant operands, we can possibly infer the size of the memory written or read.
static std::optional<FixedVectorType *>
-getFixedTypeFromScalableMemOp(Value *Mask, Type *Ty) {
- auto ActiveLaneMask = dyn_cast<IntrinsicInst>(Mask);
- if (!ActiveLaneMask ||
- ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
+getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) {
+ using namespace llvm::PatternMatch;
+ ConstantInt *Op0, *Op1;
+ if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>(
+ m_ConstantInt(Op0), m_ConstantInt(Op1))))
return std::nullopt;
- auto ScalableTy = dyn_cast<ScalableVectorType>(Ty);
- if (!ScalableTy)
+ uint64_t LaneMaskLo = Op0->getZExtValue();
+ uint64_t LaneMaskHi = Op1->getZExtValue();
+ if ((LaneMaskHi == 0) || (LaneMaskHi <= LaneMaskLo))
return std::nullopt;
- auto LaneMaskLo = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(0));
- auto LaneMaskHi = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(1));
- if (!LaneMaskLo || !LaneMaskHi)
+ uint64_t NumElts = LaneMaskHi - LaneMaskLo;
+ if (NumElts > Ty->getElementCount().getKnownMinValue())
return std::nullopt;
- uint64_t NumElts = LaneMaskHi->getZExtValue() - LaneMaskLo->getZExtValue();
- if (NumElts > ScalableTy->getMinNumElements())
- return std::nullopt;
-
- return FixedVectorType::get(ScalableTy->getElementType(), NumElts);
+ return FixedVectorType::get(Ty->getElementType(), NumElts);
}
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
@@ -239,24 +239,18 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
case Intrinsic::masked_load: {
assert(ArgIdx == 0 && "Invalid argument index");
- Type *Ty = II->getType();
- auto KnownScalableSize =
- getFixedTypeFromScalableMemOp(II->getOperand(2), Ty);
- if (KnownScalableSize)
- return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
- AATags);
+ auto *Ty = cast<VectorType>(II->getType());
+ if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty))
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
}
case Intrinsic::masked_store: {
assert(ArgIdx == 1 && "Invalid argument index");
- Type *Ty = II->getArgOperand(0)->getType();
- auto KnownScalableSize =
- getFixedTypeFromScalableMemOp(II->getOperand(3), Ty);
- if (KnownScalableSize)
- return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
- AATags);
+ auto *Ty = cast<VectorType>(II->getArgOperand(0)->getType());
+ if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty))
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
}
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
index c12d1c2f25835..12e014ec8b1ec 100644
--- a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -127,12 +127,40 @@ define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
ret <vscale x 4 x float> %retval
}
+; Don't do anything if the 2nd Op of get active lane mask is 0. This currently generates poison
+define <vscale x 4 x float> @mask_hi_0(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_hi_0(
+; CHECK: store i32 20, ptr %1
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 0)
+ store i32 20, ptr %1
+
+ %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; Don't do anything if the 2nd Op is gt/eq the 1st
+define <vscale x 4 x float> @active_lane_mask_gt_eq(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_gt_eq(
+; CHECK: store i32 20, ptr %1
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+ store i32 20, ptr %1
+
+ %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
; CHECK-NOT: store i8 60, ptr %gep.1.6
; CHECK: store i8 120, ptr %gep.1.8
;
- %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i32(i32 0, i32 7)
+ %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i8(i8 0, i8 7)
%gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
store i8 60, ptr %gep.1.6
%gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
@@ -143,3 +171,26 @@ define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
%retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
ret <vscale x 16 x i8> %retval
}
+
+define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store_offset(
+; CHECK-NOT: store i32 10, ptr %gep.1.0
+; CHECK-NOT: store i32 20, ptr %gep.1.4
+; CHECK-NOT: store i32 30, ptr %gep.1.8
+; CHECK: store i32 40, ptr %gep.1.12
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
+ %gep.1.0 = getelementptr inbounds nuw i8, ptr %1, i64 0
+ store i32 10, ptr %gep.1.0
+ %gep.1.4 = getelementptr inbounds nuw i8, ptr %1, i64 4
+ store i32 20, ptr %gep.1.4
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i32 30, ptr %gep.1.8
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 40, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
\ No newline at end of file
>From 7f8e9c8968f6af99bbbbede48683c6ac4b08b574 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Mon, 1 Sep 2025 15:10:18 +0000
Subject: [PATCH 3/3] Add fixed-width tests
Use APInt maths
Refine tests
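A hedged sketch of the APInt-based lane-count computation with the fixed-width
clamp added in this revision. The helper name is illustrative; the accessors
are the ones used in the diff below.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DerivedTypes.h"
#include <cassert>
#include <cstdint>
#include <optional>

using namespace llvm;

// Illustrative only: the number of elements known to be accessed when the lane
// mask spans [Lo, Hi). APInt comparisons avoid assuming the mask operands fit
// in 64 bits.
static std::optional<uint64_t> knownNumElts(const APInt &Lo, const APInt &Hi,
                                            VectorType *Ty) {
  assert(Lo.getBitWidth() == Hi.getBitWidth() && "mask operands share a type");
  if (Hi.ule(Lo))
    return std::nullopt;     // Empty or inverted mask.
  APInt NumElts = Hi - Lo;
  uint64_t MinElts = Ty->getElementCount().getKnownMinValue();
  if (NumElts.ugt(MinElts)) {
    if (isa<ScalableVectorType>(Ty))
      return std::nullopt;   // Active lane count depends on vscale.
    NumElts = MinElts;       // Fixed width: clamp to the vector size.
  }
  return NumElts.getZExtValue();
}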
---
llvm/lib/Analysis/MemoryLocation.cpp | 25 ++--
llvm/test/Analysis/BasicAA/scalable-dse-aa.ll | 122 +++++++++++++-----
2 files changed, 103 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index b2a36821829ca..dcc51178b975a 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -161,16 +161,21 @@ getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) {
m_ConstantInt(Op0), m_ConstantInt(Op1))))
return std::nullopt;
- uint64_t LaneMaskLo = Op0->getZExtValue();
- uint64_t LaneMaskHi = Op1->getZExtValue();
- if ((LaneMaskHi == 0) || (LaneMaskHi <= LaneMaskLo))
+ APInt LaneMaskLo = Op0->getValue();
+ APInt LaneMaskHi = Op1->getValue();
+ if (LaneMaskHi.ule(LaneMaskLo))
return std::nullopt;
- uint64_t NumElts = LaneMaskHi - LaneMaskLo;
- if (NumElts > Ty->getElementCount().getKnownMinValue())
- return std::nullopt;
+ APInt NumElts = LaneMaskHi - LaneMaskLo;
+ if (NumElts.ugt(Ty->getElementCount().getKnownMinValue())) {
+ if (isa<ScalableVectorType>(Ty))
+ return std::nullopt;
+ // Unlike scalable vectors, fixed vector types have exactly KnownMinValue
+ // elements, so the lane count can safely be clamped to that width.
+ NumElts = Ty->getElementCount().getKnownMinValue();
+ }
- return FixedVectorType::get(Ty->getElementType(), NumElts);
+ return FixedVectorType::get(Ty->getElementType(), NumElts.getZExtValue());
}
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
@@ -243,7 +248,8 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty))
return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
- return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ return MemoryLocation(
+ Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
}
case Intrinsic::masked_store: {
assert(ArgIdx == 1 && "Invalid argument index");
@@ -252,7 +258,8 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty))
return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
- return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ return MemoryLocation(
+ Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
}
case Intrinsic::invariant_end:
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
index 12e014ec8b1ec..7e980c9bfe38a 100644
--- a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -1,30 +1,29 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
-define <vscale x 4 x float> @dead_scalable_store(i32 %0, ptr %1) {
+define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
-; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
;
%arr = alloca [64 x i32], align 4
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
- %gep.1.16 = getelementptr inbounds nuw i8, ptr %1, i64 16
- %gep.1.32 = getelementptr inbounds nuw i8, ptr %1, i64 32
- %gep.1.48 = getelementptr inbounds nuw i8, ptr %1, i64 48
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.32 = getelementptr inbounds nuw i8, ptr %0, i64 32
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
%gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
%gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
%gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
- %load.1.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
- %load.1.32 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+ %load.0.32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
- %load.1.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
%faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
%faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
@@ -33,6 +32,39 @@ define <vscale x 4 x float> @dead_scalable_store(i32 %0, ptr %1) {
ret <vscale x 4 x float> %fadd
}
+define <4 x float> @dead_scalable_store_fixed(ptr %0) {
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed(
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.32, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %mask2 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 3)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.36 = getelementptr inbounds nuw i8, ptr %0, i64 36
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.36 = getelementptr inbounds nuw i8, ptr %arr, i64 36
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+
+ %load.0.36 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.36, i32 1, <4 x i1> %mask2, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+
+ %load.0.48 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+
+ %faddop0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %faddop1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fadd = fadd <4 x float> %faddop0, %faddop1
+
+ ret <4 x float> %fadd
+}
+
define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
@@ -49,13 +81,13 @@ define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
%gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
%gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
- %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
- %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
- %load.0.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
%faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
@@ -80,13 +112,13 @@ define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
%gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
%gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
- %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
- %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
- %load.0.46 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.46 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
%smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.32(i32 0, i32 2)
@@ -101,7 +133,7 @@ define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
; CHECK-NOT: store i32 20, ptr %gep.1.12
;
- %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
%gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
store i32 20, ptr %gep.1.12
@@ -111,45 +143,65 @@ define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
ret <vscale x 4 x float> %retval
}
+
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
+; CHECK-NOT: store i32 20, ptr %1
+; CHECK: store i32 50, ptr %gep.5
+define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) {
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
+ store i32 20, ptr %1
+
+ %gep.5 = getelementptr inbounds nuw i32, ptr %1, i64 5
+ store i32 50, ptr %gep.5
+
+ %load.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0, ptr nonnull %1, i32 1, <4 x i1> %mask)
+ %retval = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %retval
+}
+
; We don't know if the scalar store is dead as we can't determine vscale.
; This get active lane mask may cover 4 or 8 integers
define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 10, ptr %gep.1.12
; CHECK: store i32 20, ptr %gep.1.28
;
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 10, ptr %gep.1.12
%gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
store i32 20, ptr %gep.1.28
- %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
}
-; Don't do anything if the 2nd Op of get active lane mask is 0. This currently generates poison
-define <vscale x 4 x float> @mask_hi_0(ptr noalias %0, ptr %1) {
-; CHECK-LABEL: define <vscale x 4 x float> @mask_hi_0(
+; Don't do anything if the mask's Op1 < Op0
+define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_lt(
; CHECK: store i32 20, ptr %1
;
- %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 0)
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
store i32 20, ptr %1
- %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
}
-; Don't do anything if the 2nd Op is gt/eq the 1st
-define <vscale x 4 x float> @active_lane_mask_gt_eq(ptr noalias %0, ptr %1) {
-; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_gt_eq(
+; Don't do anything if the mask's Op1 == Op0
+define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_eq(
; CHECK: store i32 20, ptr %1
;
- %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
store i32 20, ptr %1
- %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
@@ -164,9 +216,9 @@ define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
%gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
store i8 60, ptr %gep.1.6
%gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
- store i8 120, ptr %gep.1.8
+ store i8 120, ptr %gep.1.8
- %load.0 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ %load.0 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
%retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
ret <vscale x 16 x i8> %retval
@@ -193,4 +245,4 @@ define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
-}
\ No newline at end of file
+}