[llvm] [GVN] Teach GVN simple masked load/store forwarding (PR #157689)
Matthew Devereau via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 02:55:13 PDT 2025
https://github.com/MDevereau updated https://github.com/llvm/llvm-project/pull/157689
>From 11ac166c1a8b1ebccb528094fe768aaa93668f1d Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Fri, 5 Sep 2025 11:54:10 +0000
Subject: [PATCH 1/3] [GVN] Teach GVN simple masked load/store forwarding
This patch teaches GVN how to eliminate redundant masked loads and forward
previous loads or instructions with a select. This is possible when the same
mask is used for masked stores/loads that write to the same memory location
---
llvm/include/llvm/Transforms/Scalar/GVN.h | 2 +
llvm/lib/Transforms/Scalar/GVN.cpp | 50 ++++++
llvm/test/Transforms/GVN/masked-load-store.ll | 158 ++++++++++++++++++
3 files changed, 210 insertions(+)
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 245414935bc0f..74a4d6ce00fcc 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -56,6 +56,7 @@ class OptimizationRemarkEmitter;
class PHINode;
class TargetLibraryInfo;
class Value;
+class IntrinsicInst;
/// A private "module" namespace for types and utilities used by GVN. These
/// are implementation details and should not be used by clients.
namespace LLVM_LIBRARY_VISIBILITY_NAMESPACE gvn {
@@ -349,6 +350,7 @@ class GVNPass : public PassInfoMixin<GVNPass> {
// Helper functions of redundant load elimination.
bool processLoad(LoadInst *L);
+ bool processMaskedLoad(IntrinsicInst *I);
bool processNonLocalLoad(LoadInst *L);
bool processAssumeIntrinsic(AssumeInst *II);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 26e17cc849bff..10325ab7c5737 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -50,6 +50,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -2287,6 +2288,50 @@ bool GVNPass::processLoad(LoadInst *L) {
return true;
}
+// Attempt to process masked loads which have loaded from
+// masked stores with the same mask
+bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
+ Value *Mask = I->getOperand(2);
+ Value *Passthrough = I->getOperand(3);
+
+ MemDepResult Dep = MD->getDependency(I);
+ Instruction *DepInst = Dep.getInst();
+ if (!DepInst || !Dep.isLocal())
+ return false;
+
+ auto *MaskedStore = dyn_cast<IntrinsicInst>(DepInst);
+ if (!MaskedStore || MaskedStore->getIntrinsicID() != Intrinsic::masked_store)
+ return false;
+
+ auto StoreMask = MaskedStore->getOperand(3);
+ if (StoreMask != Mask)
+ return false;
+
+ Value *OpToForward =
+ AvailableValue::get(MaskedStore->getOperand(0)).getSimpleValue();
+ if (auto *LoadToForward = dyn_cast<IntrinsicInst>(OpToForward);
+ LoadToForward &&
+ LoadToForward->getIntrinsicID() == Intrinsic::masked_load) {
+ // For MaskedLoad->MaskedStore->MaskedLoad, the mask must be the same for
+ // all three instructions. The Passthrough on the two loads must also be the
+ // same.
+ if (LoadToForward->getOperand(2) != Mask ||
+ LoadToForward->getOperand(3) != Passthrough)
+ return false;
+ } else {
+ // MaskedStore(Op, ptr, mask)->MaskedLoad(ptr, mask, passthrough) can be
+ // replaced with MaskedStore(Op, ptr, mask)->select(mask, Op, passthrough)
+ IRBuilder<> Builder(I);
+ OpToForward = Builder.CreateSelect(StoreMask, OpToForward, Passthrough);
+ }
+
+ I->replaceAllUsesWith(OpToForward);
+ ICF->removeUsersOf(I);
+ salvageAndRemoveInstruction(I);
+ ++NumGVNLoad;
+ return true;
+}
+
/// Return a pair the first field showing the value number of \p Exp and the
/// second field showing whether it is a value number newly created.
std::pair<uint32_t, bool>
@@ -2734,6 +2779,11 @@ bool GVNPass::processInstruction(Instruction *I) {
return false;
}
+ if (auto *II = dyn_cast<IntrinsicInst>(I))
+ if (II && II->getIntrinsicID() == Intrinsic::masked_load)
+ if (processMaskedLoad(II))
+ return true;
+
// For conditional branches, we can perform simple conditional propagation on
// the condition value itself.
if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 984a756591701..b32279941d0b0 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -36,6 +36,164 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) {
ret <128 x i8> %v4
}
+define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_masked_load(
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %6 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %7 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %7, ptr %1, i32 1, <4 x i1> %6)
+ %8 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
+ ret <4 x float> %8
+}
+
+define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_binop_splat_i1_mask(
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT: ret <4 x float> [[FMUL]]
+;
+ %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fmul = fmul <4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+ %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %load.1.0
+}
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+ %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fmul = fmul <4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+ %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+ ret <4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable(
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP4]]
+;
+ %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
+ %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %8
+}
+
+define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @bail_on_different_passthrough(
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]]
+;
+ %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
+ %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %8
+}
+
+define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel_scalable(
+; CHECK-NEXT: [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP3]]
+;
+ %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+ %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @load_mask_differs(
+; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]])
+; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+ %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0)
+ %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @store_mask_differs(
+; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]])
+; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+ %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1)
+ %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load.1.0
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x float>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
+declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #3
+
declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)
>From c335ed60b89b4f63e964637220daff13839e8277 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Wed, 10 Sep 2025 09:24:12 +0000
Subject: [PATCH 2/3] Use PatternMatch
---
llvm/lib/Transforms/Scalar/GVN.cpp | 28 ++++++++---------
llvm/test/Transforms/GVN/masked-load-store.ll | 30 +++++++++----------
2 files changed, 27 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 10325ab7c5737..7b43d922b7caa 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2299,30 +2299,26 @@ bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
if (!DepInst || !Dep.isLocal())
return false;
- auto *MaskedStore = dyn_cast<IntrinsicInst>(DepInst);
- if (!MaskedStore || MaskedStore->getIntrinsicID() != Intrinsic::masked_store)
+ Value *StoreVal;
+ if (!match(DepInst,
+ m_Intrinsic<Intrinsic::masked_store>(m_Value(StoreVal), m_Value(),
+ m_Value(), m_Specific(Mask))))
return false;
- auto StoreMask = MaskedStore->getOperand(3);
- if (StoreMask != Mask)
- return false;
-
- Value *OpToForward =
- AvailableValue::get(MaskedStore->getOperand(0)).getSimpleValue();
- if (auto *LoadToForward = dyn_cast<IntrinsicInst>(OpToForward);
- LoadToForward &&
- LoadToForward->getIntrinsicID() == Intrinsic::masked_load) {
+ Value *OpToForward = nullptr;
+ if (match(StoreVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(Mask),
+ m_Specific(Passthrough))))
// For MaskedLoad->MaskedStore->MaskedLoad, the mask must be the same for
// all three instructions. The Passthrough on the two loads must also be the
// same.
- if (LoadToForward->getOperand(2) != Mask ||
- LoadToForward->getOperand(3) != Passthrough)
- return false;
- } else {
+ OpToForward = AvailableValue::get(StoreVal).getSimpleValue();
+ else if (match(StoreVal, m_Intrinsic<Intrinsic::masked_load>()))
+ return false;
+ else {
// MaskedStore(Op, ptr, mask)->MaskedLoad(ptr, mask, passthrough) can be
// replaced with MaskedStore(Op, ptr, mask)->select(mask, Op, passthrough)
IRBuilder<> Builder(I);
- OpToForward = Builder.CreateSelect(StoreMask, OpToForward, Passthrough);
+ OpToForward = Builder.CreateSelect(Mask, StoreVal, Passthrough);
}
I->replaceAllUsesWith(OpToForward);
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index b32279941d0b0..77888476edb7d 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -42,11 +42,11 @@ define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
; CHECK-NEXT: ret <4 x float> [[TMP4]]
;
- %6 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
- %7 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
- call void @llvm.masked.store.v4f32.p0(<4 x float> %7, ptr %1, i32 1, <4 x i1> %6)
- %8 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
- ret <4 x float> %8
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %1, i32 1, <4 x i1> %mask)
+ %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %load2
}
define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
@@ -96,11 +96,11 @@ define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscal
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP4]]
;
- %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
- %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
- %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
- ret <vscale x 4 x float> %8
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+ %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load2
}
define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
@@ -111,11 +111,11 @@ define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vsca
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]]
;
- %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
- %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> zeroinitializer)
- call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
- %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
- ret <vscale x 4 x float> %8
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+ %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load2
}
define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
>From 0d7107f2838dc06b6220f96615de1a04cff7d64c Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Wed, 10 Sep 2025 09:51:38 +0000
Subject: [PATCH 3/3] Remove intrinsic declarations
---
llvm/test/Transforms/GVN/masked-load-store.ll | 13 -------------
1 file changed, 13 deletions(-)
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 77888476edb7d..e9d3223055995 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -184,16 +184,3 @@ define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x fl
%load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
ret <vscale x 4 x float> %load.1.0
}
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
-declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x float>) #1
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
-declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #3
-
-declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
-declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)
-
More information about the llvm-commits
mailing list