[llvm] [MemoryLocation] Size Scalable Masked MemOps (PR #154785)
Matthew Devereau via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 1 08:19:48 PDT 2025
https://github.com/MDevereau updated https://github.com/llvm/llvm-project/pull/154785
>From 76a56aeb5b5faca73f32fec0db04e6d2ce98bde7 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Fri, 8 Aug 2025 13:06:20 +0000
Subject: [PATCH 1/3] [MemoryLocation] Size Scalable Masked MemOps
Scalable masked loads and stores with a get.active.lane.mask whose active lane
count is less than or equal to the scalable type's minimum number of elements
can be proven to have a fixed size. Adding this information allows scalable
masked loads and stores to benefit from alias analysis optimizations.
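For readers skimming the thread, here is a minimal standalone sketch of the
sizing rule the patch relies on. The helper name and signature below are
illustrative only; the real logic lives in the new helper added to
llvm/lib/Analysis/MemoryLocation.cpp in the diff.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include <cstdint>
#include <optional>

using namespace llvm;

// Illustrative only: a scalable masked load/store governed by
// llvm.get.active.lane.mask(Lo, Hi) with constant operands enables exactly
// Hi - Lo lanes. If that count is no larger than the type's minimum element
// count (vscale >= 1 guarantees at least that many lanes exist), the access
// touches exactly (Hi - Lo) elements, i.e. a fixed number of bytes.
static std::optional<TypeSize> knownAccessSize(const DataLayout &DL,
                                               ScalableVectorType *Ty,
                                               uint64_t LaneMaskLo,
                                               uint64_t LaneMaskHi) {
  if (LaneMaskHi <= LaneMaskLo)
    return std::nullopt; // Empty or inverted mask: nothing provable.
  uint64_t NumElts = LaneMaskHi - LaneMaskLo;
  if (NumElts > Ty->getMinNumElements())
    return std::nullopt; // Active lane count would depend on vscale.
  return DL.getTypeStoreSize(
      FixedVectorType::get(Ty->getElementType(), NumElts));
}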
---
llvm/lib/Analysis/MemoryLocation.cpp | 55 +++++--
llvm/test/Analysis/BasicAA/scalable-dse-aa.ll | 145 ++++++++++++++++++
2 files changed, 189 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 72b643c56a994..f2c3b843f70f6 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -150,6 +150,29 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
}
+static std::optional<FixedVectorType *>
+getFixedTypeFromScalableMemOp(Value *Mask, Type *Ty) {
+ auto ActiveLaneMask = dyn_cast<IntrinsicInst>(Mask);
+ if (!ActiveLaneMask ||
+ ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
+ return std::nullopt;
+
+ auto ScalableTy = dyn_cast<ScalableVectorType>(Ty);
+ if (!ScalableTy)
+ return std::nullopt;
+
+ auto LaneMaskLo = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(0));
+ auto LaneMaskHi = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(1));
+ if (!LaneMaskLo || !LaneMaskHi)
+ return std::nullopt;
+
+ uint64_t NumElts = LaneMaskHi->getZExtValue() - LaneMaskLo->getZExtValue();
+ if (NumElts > ScalableTy->getMinNumElements())
+ return std::nullopt;
+
+ return FixedVectorType::get(ScalableTy->getElementType(), NumElts);
+}
+
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
unsigned ArgIdx,
const TargetLibraryInfo *TLI) {
@@ -213,20 +236,30 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
cast<ConstantInt>(II->getArgOperand(0))->getZExtValue()),
AATags);
- case Intrinsic::masked_load:
+ case Intrinsic::masked_load: {
assert(ArgIdx == 0 && "Invalid argument index");
- return MemoryLocation(
- Arg,
- LocationSize::upperBound(DL.getTypeStoreSize(II->getType())),
- AATags);
- case Intrinsic::masked_store:
+ Type *Ty = II->getType();
+ auto KnownScalableSize =
+ getFixedTypeFromScalableMemOp(II->getOperand(2), Ty);
+ if (KnownScalableSize)
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
+ AATags);
+
+ return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ }
+ case Intrinsic::masked_store: {
assert(ArgIdx == 1 && "Invalid argument index");
- return MemoryLocation(
- Arg,
- LocationSize::upperBound(
- DL.getTypeStoreSize(II->getArgOperand(0)->getType())),
- AATags);
+
+ Type *Ty = II->getArgOperand(0)->getType();
+ auto KnownScalableSize =
+ getFixedTypeFromScalableMemOp(II->getOperand(3), Ty);
+ if (KnownScalableSize)
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
+ AATags);
+
+ return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ }
case Intrinsic::invariant_end:
// The first argument to an invariant.end is a "descriptor" type (e.g. a
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
new file mode 100644
index 0000000000000..c12d1c2f25835
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
+
+define <vscale x 4 x float> @dead_scalable_store(i32 %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.1.16 = getelementptr inbounds nuw i8, ptr %1, i64 16
+ %gep.1.32 = getelementptr inbounds nuw i8, ptr %1, i64 32
+ %gep.1.48 = getelementptr inbounds nuw i8, ptr %1, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.1.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.1.32 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.1.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.46 = getelementptr inbounds nuw i8, ptr %0, i64 46
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
+
+ %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.46 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+
+ %smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.32(i32 0, i32 2)
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %smallmask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
+; CHECK-NOT: store i32 20, ptr %gep.1.12
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 20, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; We don't know if the scalar store is dead as we can't determine vscale.
+; This get active lane mask may cover 4 or 8 integers
+define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 20, ptr %gep.1.28
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
+ store i32 20, ptr %gep.1.28
+
+ %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
+; CHECK-NOT: store i8 60, ptr %gep.1.6
+; CHECK: store i8 120, ptr %gep.1.8
+;
+ %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i32(i32 0, i32 7)
+ %gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
+ store i8 60, ptr %gep.1.6
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i8 120, ptr %gep.1.8
+
+ %load.0 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
+ %retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ ret <vscale x 16 x i8> %retval
+}
>From fc9274f5a9adf77471a2f627e6b9488af3ad3648 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Wed, 27 Aug 2025 09:27:28 +0000
Subject: [PATCH 2/3] Address review comments
Use patternmatch logic
Add pointer tokens to auto declarations
Add offset test dead_scalar_store_offset
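For reference, a rough standalone sketch of the PatternMatch-based mask
recognition this revision switches to. The helper name here is made up; only
the match() expression mirrors what the diff below adds.

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include <cstdint>
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

// Illustrative only: returns the number of active lanes if Mask is
// llvm.get.active.lane.mask(Lo, Hi) with constant operands and Hi > Lo.
// getZExtValue() assumes the operands fit in 64 bits; the final revision of
// the patch switches to APInt for exactly that reason.
static std::optional<uint64_t> constantActiveLanes(Value *Mask) {
  ConstantInt *Lo, *Hi;
  if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>(
                       m_ConstantInt(Lo), m_ConstantInt(Hi))))
    return std::nullopt;
  if (Hi->getZExtValue() <= Lo->getZExtValue())
    return std::nullopt; // Empty or inverted range: nothing provable.
  return Hi->getZExtValue() - Lo->getZExtValue();
}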
---
llvm/lib/Analysis/MemoryLocation.cpp | 46 +++++++---------
llvm/test/Analysis/BasicAA/scalable-dse-aa.ll | 53 ++++++++++++++++++-
2 files changed, 72 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index f2c3b843f70f6..b2a36821829ca 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -12,6 +12,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include <optional>
using namespace llvm;
@@ -150,27 +151,26 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
}
+// If the mask of a masked memory op is a get.active.lane.mask intrinsic with
+// constant operands, we can possibly infer the size of the memory written or read.
static std::optional<FixedVectorType *>
-getFixedTypeFromScalableMemOp(Value *Mask, Type *Ty) {
- auto ActiveLaneMask = dyn_cast<IntrinsicInst>(Mask);
- if (!ActiveLaneMask ||
- ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
+getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) {
+ using namespace llvm::PatternMatch;
+ ConstantInt *Op0, *Op1;
+ if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>(
+ m_ConstantInt(Op0), m_ConstantInt(Op1))))
return std::nullopt;
- auto ScalableTy = dyn_cast<ScalableVectorType>(Ty);
- if (!ScalableTy)
+ uint64_t LaneMaskLo = Op0->getZExtValue();
+ uint64_t LaneMaskHi = Op1->getZExtValue();
+ if ((LaneMaskHi == 0) || (LaneMaskHi <= LaneMaskLo))
return std::nullopt;
- auto LaneMaskLo = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(0));
- auto LaneMaskHi = dyn_cast<ConstantInt>(ActiveLaneMask->getOperand(1));
- if (!LaneMaskLo || !LaneMaskHi)
+ uint64_t NumElts = LaneMaskHi - LaneMaskLo;
+ if (NumElts > Ty->getElementCount().getKnownMinValue())
return std::nullopt;
- uint64_t NumElts = LaneMaskHi->getZExtValue() - LaneMaskLo->getZExtValue();
- if (NumElts > ScalableTy->getMinNumElements())
- return std::nullopt;
-
- return FixedVectorType::get(ScalableTy->getElementType(), NumElts);
+ return FixedVectorType::get(Ty->getElementType(), NumElts);
}
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
@@ -239,24 +239,18 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
case Intrinsic::masked_load: {
assert(ArgIdx == 0 && "Invalid argument index");
- Type *Ty = II->getType();
- auto KnownScalableSize =
- getFixedTypeFromScalableMemOp(II->getOperand(2), Ty);
- if (KnownScalableSize)
- return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
- AATags);
+ auto *Ty = cast<VectorType>(II->getType());
+ if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty))
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
}
case Intrinsic::masked_store: {
assert(ArgIdx == 1 && "Invalid argument index");
- Type *Ty = II->getArgOperand(0)->getType();
- auto KnownScalableSize =
- getFixedTypeFromScalableMemOp(II->getOperand(3), Ty);
- if (KnownScalableSize)
- return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownScalableSize),
- AATags);
+ auto *Ty = cast<VectorType>(II->getArgOperand(0)->getType());
+ if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty))
+ return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
}
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
index c12d1c2f25835..12e014ec8b1ec 100644
--- a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -127,12 +127,40 @@ define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
ret <vscale x 4 x float> %retval
}
+; Don't do anything if the 2nd Op of get active lane mask is 0. This currently generates poison
+define <vscale x 4 x float> @mask_hi_0(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_hi_0(
+; CHECK: store i32 20, ptr %1
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 0)
+ store i32 20, ptr %1
+
+ %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; Don't do anything if the 2nd Op is gt/eq the 1st
+define <vscale x 4 x float> @active_lane_mask_gt_eq(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_gt_eq(
+; CHECK: store i32 20, ptr %1
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+ store i32 20, ptr %1
+
+ %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
; CHECK-NOT: store i8 60, ptr %gep.1.6
; CHECK: store i8 120, ptr %gep.1.8
;
- %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i32(i32 0, i32 7)
+ %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i8(i8 0, i8 7)
%gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
store i8 60, ptr %gep.1.6
%gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
@@ -143,3 +171,26 @@ define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
%retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
ret <vscale x 16 x i8> %retval
}
+
+define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store_offset(
+; CHECK-NOT: store i32 10, ptr %gep.1.0
+; CHECK-NOT: store i32 20, ptr %gep.1.4
+; CHECK-NOT: store i32 30, ptr %gep.1.8
+; CHECK: store i32 40, ptr %gep.1.12
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
+ %gep.1.0 = getelementptr inbounds nuw i8, ptr %1, i64 0
+ store i32 10, ptr %gep.1.0
+ %gep.1.4 = getelementptr inbounds nuw i8, ptr %1, i64 4
+ store i32 20, ptr %gep.1.4
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i32 30, ptr %gep.1.8
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 40, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
\ No newline at end of file
>From 7f8e9c8968f6af99bbbbede48683c6ac4b08b574 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Mon, 1 Sep 2025 15:10:18 +0000
Subject: [PATCH 3/3] Add fixed-width tests
Use APInt maths
Refine tests
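A hedged sketch of the APInt-based lane-count computation with the fixed-width
clamp added in this revision. The helper name is illustrative; the accessors
are the ones used in the diff below.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DerivedTypes.h"
#include <cassert>
#include <cstdint>
#include <optional>

using namespace llvm;

// Illustrative only: the number of elements known to be accessed when the lane
// mask spans [Lo, Hi). APInt comparisons avoid assuming the mask operands fit
// in 64 bits.
static std::optional<uint64_t> knownNumElts(const APInt &Lo, const APInt &Hi,
                                            VectorType *Ty) {
  assert(Lo.getBitWidth() == Hi.getBitWidth() && "mask operands share a type");
  if (Hi.ule(Lo))
    return std::nullopt;     // Empty or inverted mask.
  APInt NumElts = Hi - Lo;
  uint64_t MinElts = Ty->getElementCount().getKnownMinValue();
  if (NumElts.ugt(MinElts)) {
    if (isa<ScalableVectorType>(Ty))
      return std::nullopt;   // Active lane count depends on vscale.
    NumElts = MinElts;       // Fixed width: clamp to the vector size.
  }
  return NumElts.getZExtValue();
}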
---
llvm/lib/Analysis/MemoryLocation.cpp | 25 ++--
llvm/test/Analysis/BasicAA/scalable-dse-aa.ll | 122 +++++++++++++-----
2 files changed, 103 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index b2a36821829ca..dcc51178b975a 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -161,16 +161,21 @@ getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) {
m_ConstantInt(Op0), m_ConstantInt(Op1))))
return std::nullopt;
- uint64_t LaneMaskLo = Op0->getZExtValue();
- uint64_t LaneMaskHi = Op1->getZExtValue();
- if ((LaneMaskHi == 0) || (LaneMaskHi <= LaneMaskLo))
+ APInt LaneMaskLo = Op0->getValue();
+ APInt LaneMaskHi = Op1->getValue();
+ if (LaneMaskHi.ule(LaneMaskLo))
return std::nullopt;
- uint64_t NumElts = LaneMaskHi - LaneMaskLo;
- if (NumElts > Ty->getElementCount().getKnownMinValue())
- return std::nullopt;
+ APInt NumElts = LaneMaskHi - LaneMaskLo;
+ if (NumElts.ugt(Ty->getElementCount().getKnownMinValue())) {
+ if (isa<ScalableVectorType>(Ty))
+ return std::nullopt;
+ // Unlike scalable vectors, fixed vector types have exactly KnownMinValue
+ // elements, so the lane count can safely be clamped to that width.
+ NumElts = Ty->getElementCount().getKnownMinValue();
+ }
- return FixedVectorType::get(Ty->getElementType(), NumElts);
+ return FixedVectorType::get(Ty->getElementType(), NumElts.getZExtValue());
}
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
@@ -243,7 +248,8 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty))
return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
- return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ return MemoryLocation(
+ Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
}
case Intrinsic::masked_store: {
assert(ArgIdx == 1 && "Invalid argument index");
@@ -252,7 +258,8 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty))
return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
- return MemoryLocation(Arg, DL.getTypeStoreSize(Ty), AATags);
+ return MemoryLocation(
+ Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
}
case Intrinsic::invariant_end:
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
index 12e014ec8b1ec..7e980c9bfe38a 100644
--- a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -1,30 +1,29 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
-define <vscale x 4 x float> @dead_scalable_store(i32 %0, ptr %1) {
+define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
-; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
;
%arr = alloca [64 x i32], align 4
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
- %gep.1.16 = getelementptr inbounds nuw i8, ptr %1, i64 16
- %gep.1.32 = getelementptr inbounds nuw i8, ptr %1, i64 32
- %gep.1.48 = getelementptr inbounds nuw i8, ptr %1, i64 48
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.32 = getelementptr inbounds nuw i8, ptr %0, i64 32
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
%gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
%gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
%gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
- %load.1.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
- %load.1.32 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+ %load.0.32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
- %load.1.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.1.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.1.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
%faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
%faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
@@ -33,6 +32,39 @@ define <vscale x 4 x float> @dead_scalable_store(i32 %0, ptr %1) {
ret <vscale x 4 x float> %fadd
}
+define <4 x float> @dead_scalable_store_fixed(ptr %0) {
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed(
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.32, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+;
+ %arr = alloca [64 x i32], align 4
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %mask2 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 3)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.36 = getelementptr inbounds nuw i8, ptr %0, i64 36
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.36 = getelementptr inbounds nuw i8, ptr %arr, i64 36
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+
+ %load.0.36 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.36, i32 1, <4 x i1> %mask2, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+
+ %load.0.48 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+
+ %faddop0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %faddop1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fadd = fadd <4 x float> %faddop0, %faddop1
+
+ ret <4 x float> %fadd
+}
+
define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
@@ -49,13 +81,13 @@ define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
%gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
%gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
- %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
- %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
- %load.0.48 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
%faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
@@ -80,13 +112,13 @@ define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
%gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
%gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
- %load.0.16 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
- %load.0.30 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
- %load.0.46 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0.46 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
%smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.32(i32 0, i32 2)
@@ -101,7 +133,7 @@ define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
; CHECK-NOT: store i32 20, ptr %gep.1.12
;
- %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
%gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
store i32 20, ptr %gep.1.12
@@ -111,45 +143,65 @@ define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
ret <vscale x 4 x float> %retval
}
+
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
+; CHECK-NOT: store i32 20, ptr %1
+; CHECK: store i32 50, ptr %gep.5
+define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) {
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
+ store i32 20, ptr %1
+
+ %gep.5 = getelementptr inbounds nuw i32, ptr %1, i64 5
+ store i32 50, ptr %gep.5
+
+ %load.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0, ptr nonnull %1, i32 1, <4 x i1> %mask)
+ %retval = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %retval
+}
+
; We don't know if the scalar store is dead as we can't determine vscale.
; This get active lane mask may cover 4 or 8 integers
define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 10, ptr %gep.1.12
; CHECK: store i32 20, ptr %gep.1.28
;
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 10, ptr %gep.1.12
%gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
store i32 20, ptr %gep.1.28
- %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
}
-; Don't do anything if the 2nd Op of get active lane mask is 0. This currently generates poison
-define <vscale x 4 x float> @mask_hi_0(ptr noalias %0, ptr %1) {
-; CHECK-LABEL: define <vscale x 4 x float> @mask_hi_0(
+; Don't do anything if the mask's Op1 < Op0
+define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_lt(
; CHECK: store i32 20, ptr %1
;
- %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 0)
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
store i32 20, ptr %1
- %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
}
-; Don't do anything if the 2nd Op is gt/eq the 1st
-define <vscale x 4 x float> @active_lane_mask_gt_eq(ptr noalias %0, ptr %1) {
-; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_gt_eq(
+; Don't do anything if the mask's Op1 == Op0
+define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_eq(
; CHECK: store i32 20, ptr %1
;
- %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
store i32 20, ptr %1
- %load.0 = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
@@ -164,9 +216,9 @@ define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
%gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
store i8 60, ptr %gep.1.6
%gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
- store i8 120, ptr %gep.1.8
+ store i8 120, ptr %gep.1.8
- %load.0 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ %load.0 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
%retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
ret <vscale x 16 x i8> %retval
@@ -193,4 +245,4 @@ define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
%retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
ret <vscale x 4 x float> %retval
-}
\ No newline at end of file
+}