[llvm] f831463 - [MemoryLocation] Size Scalable Masked MemOps (#154785)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 04:02:44 PDT 2025
Author: Matthew Devereau
Date: 2025-09-04T12:02:40+01:00
New Revision: f831463704e163030c5dc374c406e9f4126a436e
URL: https://github.com/llvm/llvm-project/commit/f831463704e163030c5dc374c406e9f4126a436e
DIFF: https://github.com/llvm/llvm-project/commit/f831463704e163030c5dc374c406e9f4126a436e.diff
LOG: [MemoryLocation] Size Scalable Masked MemOps (#154785)
Scalable masked loads and stores with a get.active.lane.mask whose size
is less than or equal to the scalable type's known minimum number of
elements can be proven to have a fixed size. Adding this information
allows scalable masked loads and stores to benefit from alias analysis
optimizations.
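For illustration, a minimal sketch (not taken from the patch; the function and
value names are made up) of the pattern the change recognises. The mask covers
lanes [0, 4), which is no more than the known minimum element count of
<vscale x 4 x float>, so both memory operations are known to access exactly
16 bytes rather than an unknown scalable upper bound:

define void @example(ptr %src, ptr %dst) {
  ; Lanes [0, 4) are active; 4 <= the known minimum element count of the type.
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
  ; Both the load and the store below now get a precise 16-byte MemoryLocation.
  %v = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %src, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %v, ptr %dst, i32 1, <vscale x 4 x i1> %mask)
  ret void
}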
Added:
llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
Modified:
llvm/lib/Analysis/MemoryLocation.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 72b643c56a994..dcc51178b975a 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -12,6 +12,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include <optional>
using namespace llvm;
@@ -150,6 +151,33 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
}
+// If the mask for a memory op is a get.active.lane.mask intrinsic, we can
+// possibly infer the size of the memory written or read.
+static std::optional<FixedVectorType *>
+getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) {
+ using namespace llvm::PatternMatch;
+ ConstantInt *Op0, *Op1;
+ if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>(
+ m_ConstantInt(Op0), m_ConstantInt(Op1))))
+ return std::nullopt;
+
+ APInt LaneMaskLo = Op0->getValue();
+ APInt LaneMaskHi = Op1->getValue();
+ if (LaneMaskHi.ule(LaneMaskLo))
+ return std::nullopt;
+
+ APInt NumElts = LaneMaskHi - LaneMaskLo;
+ if (NumElts.ugt(Ty->getElementCount().getKnownMinValue())) {
+ if (isa<ScalableVectorType>(Ty))
+ return std::nullopt;
+ // Unlike scalable vectors, fixed vector types are guaranteed to handle the
+ // KnownMinValue and can be clamped
+ NumElts = Ty->getElementCount().getKnownMinValue();
+ }
+
+ return FixedVectorType::get(Ty->getElementType(), NumElts.getZExtValue());
+}
+
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
unsigned ArgIdx,
const TargetLibraryInfo *TLI) {
@@ -213,20 +241,26 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
cast<ConstantInt>(II->getArgOperand(0))->getZExtValue()),
AATags);
- case Intrinsic::masked_load:
+ case Intrinsic::masked_load: {
assert(ArgIdx == 0 && "Invalid argument index");
- return MemoryLocation(
- Arg,
- LocationSize::upperBound(DL.getTypeStoreSize(II->getType())),
- AATags);
- case Intrinsic::masked_store:
+ auto *Ty = cast<VectorType>(II->getType());
+ if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty))
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
+
+ return MemoryLocation(
+ Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
+ }
+ case Intrinsic::masked_store: {
assert(ArgIdx == 1 && "Invalid argument index");
+
+ auto *Ty = cast<VectorType>(II->getArgOperand(0)->getType());
+ if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty))
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
+
return MemoryLocation(
- Arg,
- LocationSize::upperBound(
- DL.getTypeStoreSize(II->getArgOperand(0)->getType())),
- AATags);
+ Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
+ }
case Intrinsic::invariant_end:
// The first argument to an invariant.end is a "descriptor" type (e.g. a
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
new file mode 100644
index 0000000000000..7e980c9bfe38a
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -0,0 +1,248 @@
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
+
+define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.32 = getelementptr inbounds nuw i8, ptr %0, i64 32
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <4 x float> @dead_scalable_store_fixed(ptr %0) {
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed(
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.32, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %mask2 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 3)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.36 = getelementptr inbounds nuw i8, ptr %0, i64 36
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.36 = getelementptr inbounds nuw i8, ptr %arr, i64 36
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+
+ %load.0.36 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.36, i32 1, <4 x i1> %mask2, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+
+ %load.0.48 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+
+ %faddop0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %faddop1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fadd = fadd <4 x float> %faddop0, %faddop1
+
+ ret <4 x float> %fadd
+}
+
+define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.46 = getelementptr inbounds nuw i8, ptr %0, i64 46
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
+
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.46 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+
+ %smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.32(i32 0, i32 2)
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %smallmask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
+; CHECK-NOT: store i32 20, ptr %gep.1.12
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 20, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
+; CHECK-NOT: store i32 20, ptr %1
+; CHECK: store i32 50, ptr %gep.5
+define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) {
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
+ store i32 20, ptr %1
+
+ %gep.5 = getelementptr inbounds nuw i32, ptr %1, i64 5
+ store i32 50, ptr %gep.5
+
+ %load.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0, ptr nonnull %1, i32 1, <4 x i1> %mask)
+ %retval = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %retval
+}
+
+; We don't know if the scalar store is dead as we can't determine vscale.
+; This get active lane mask may cover 4 or 8 integers
+define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 10, ptr %gep.1.12
+; CHECK: store i32 20, ptr %gep.1.28
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 10, ptr %gep.1.12
+ %gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
+ store i32 20, ptr %gep.1.28
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; Don't do anything if the mask's Op1 < Op0
+define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_lt(
+; CHECK: store i32 20, ptr %1
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+ store i32 20, ptr %1
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; Don't do anything if the mask's Op1 == Op0
+define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_eq(
+; CHECK: store i32 20, ptr %1
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
+ store i32 20, ptr %1
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
+; CHECK-NOT: store i8 60, ptr %gep.1.6
+; CHECK: store i8 120, ptr %gep.1.8
+;
+ %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i8(i8 0, i8 7)
+ %gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
+ store i8 60, ptr %gep.1.6
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i8 120, ptr %gep.1.8
+
+ %load.0 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
+ %retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ ret <vscale x 16 x i8> %retval
+}
+
+define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store_offset(
+; CHECK-NOT: store i32 10, ptr %gep.1.0
+; CHECK-NOT: store i32 20, ptr %gep.1.4
+; CHECK-NOT: store i32 30, ptr %gep.1.8
+; CHECK: store i32 40, ptr %gep.1.12
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
+ %gep.1.0 = getelementptr inbounds nuw i8, ptr %1, i64 0
+ store i32 10, ptr %gep.1.0
+ %gep.1.4 = getelementptr inbounds nuw i8, ptr %1, i64 4
+ store i32 20, ptr %gep.1.4
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i32 30, ptr %gep.1.8
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 40, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}