[llvm] [MemoryLocation] Size Scalable Masked MemOps (PR #154785)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 28 10:01:23 PDT 2025


================
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
+
+define <vscale x 4 x float> @dead_scalable_store(i32 %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+  %arr = alloca [64 x i32], align 4
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+  %gep.1.16 = getelementptr inbounds nuw i8, ptr %1, i64 16
+  %gep.1.32 = getelementptr inbounds nuw i8, ptr %1, i64 32
+  %gep.1.48 = getelementptr inbounds nuw i8, ptr %1, i64 48
+  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+  %gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
+  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+  %load.1.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.1.32 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.1.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+  ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+  %arr = alloca [64 x i32], align 4
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+  %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+  %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+  %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+  ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+  %arr = alloca [64 x i32], align 4
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+  %gep.0.46 = getelementptr inbounds nuw i8, ptr %0, i64 46
+  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+  %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+  %gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
+
+  %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.46 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+
+  %smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 2)
+  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %smallmask, <vscale x 4 x float> zeroinitializer)
+  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+  ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
+; CHECK-NOT: store i32 20, ptr %gep.1.12
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+  store i32 20, ptr %gep.1.12
+
+  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  ret <vscale x 4 x float> %retval
+}
+
+; We don't know if the scalar store is dead because we can't determine vscale.
+; The get.active.lane.mask call may cover 4 or 8 integers depending on vscale.
+define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 20, ptr %gep.1.28
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
+  store i32 20, ptr %gep.1.28
+
+  %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  ret <vscale x 4 x float> %retval
+}
+
+; Don't do anything if the second operand of get.active.lane.mask is 0, as this currently generates poison.
+define <vscale x 4 x float> @mask_hi_0(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_hi_0(
+; CHECK: store i32 20, ptr %1
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 0)
----------------
david-arm wrote:

I'm not sure how useful this test is, because if we did treat (0,0) as returning an all-false mask (rather than poison, as the LangRef specifies), then you'd expect the masked store below to be a no-op, since it doesn't write anything. In that case I'd still expect the scalar store to remain as before. Perhaps a test like this instead?

```
define <4 x i32> @mask_hi_0(ptr %0) {
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 0)
  call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat(i32 1), ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask)
  store i8 3, ptr %0
  store i32 20, ptr %0
  %retval = load <4 x i32>, ptr %0, align 1
  ret <4 x i32> %retval
}
```

You can see that the `store i8 3, ptr %0` gets deleted, but the masked store remains. We should be able to kill off the masked store as well.
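
For illustration, if DSE did treat the all-false mask as writing nothing, the function above might reduce to something like this (a hand-written sketch of the hoped-for result, not autogenerated output):

```
define <4 x i32> @mask_hi_0(ptr %0) {
  ; Both the masked store and the i8 store are dead: the mask enables no
  ; lanes, and the i32 store fully overwrites the i8 store. Only the i32
  ; store survives because the final load reads it.
  store i32 20, ptr %0
  %retval = load <4 x i32>, ptr %0, align 1
  ret <4 x i32> %retval
}
```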

Incidentally, the pass doesn't deal with fixed-width constant masks either, such as `<i1 1, i1 1, i1 0, i1 0>` or `zeroinitializer`, but perhaps that can be done in a different patch?
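
For example (a purely hypothetical follow-up test, with names and offsets picked only for illustration), a fixed-width constant-mask case the pass could eventually handle might look like:

```
define void @dead_scalar_store_fixed_mask(ptr noalias %src, ptr %dst) {
  ; The scalar store writes bytes 0-3 of %dst, which the masked store below
  ; fully overwrites via its first two enabled lanes (bytes 0-7), so the
  ; scalar store ought to be removable once constant masks are understood.
  store i32 20, ptr %dst
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %src, i32 1, <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i32> zeroinitializer)
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v, ptr %dst, i32 1, <4 x i1> <i1 1, i1 1, i1 0, i1 0>)
  ret void
}
```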

https://github.com/llvm/llvm-project/pull/154785

