[llvm] [IR][AArch64] Add llvm.masked.speculative.load intrinsic (PR #156470)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 30 05:36:11 PDT 2025


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/156470

From b76e9401d8cd3ce3b4a8f88df7f35ed2efd6a711 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 8 Aug 2025 11:06:12 +0000
Subject: [PATCH] [IR] Add llvm.masked.speculative.load intrinsic

In order to support loading from addresses which may not be valid at
runtime without generating faults, we introduce the speculative load
intrinsic. Loading with this intrinsic will only generate a fault for
invalid accesses on the first element of the vector. Any subsequent
fault will be suppressed and the corresponding data will be poison.

This PR contains target-independent scalarization of the intrinsic so
that generic codegen works.
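
For illustration, a call looks like the following (the value names here are
placeholders only), returning both the loaded data and an output mask
describing which lanes were actually loaded:

  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %mask)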
---
 llvm/docs/LangRef.rst                         | 59 ++++++++++++++
 llvm/include/llvm/IR/Intrinsics.td            |  8 ++
 .../Scalar/ScalarizeMaskedMemIntrin.cpp       | 66 ++++++++++++++++
 .../masked-speculative-load-fixed-vectors.ll  | 79 +++++++++++++++++++
 4 files changed, 212 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1c6823be44dcb..92ea6554c4aba 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -27098,6 +27098,65 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
 
 Other targets may support this intrinsic differently, for example, by lowering it into a sequence of branches that guard scalar store operations.
 
+.. _int_mspecload:
+
+'``llvm.masked.speculative.load.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The loaded data is a vector of any integer, floating-point or pointer data type.
+
+::
+
+      declare { <16 x float>, <16 x i1> } @llvm.masked.speculative.load.v16f32.p0(ptr <ptr>, i32 <alignment>, <16 x i1> <mask>)
+      declare { <2 x double>, <2 x i1> } @llvm.masked.speculative.load.v2f64.p0(ptr <ptr>, i32 <alignment>, <2 x i1> <mask>)
+      ;; The data is a vector of pointers
+      declare { <8 x ptr>, <8 x i1> } @llvm.masked.speculative.load.v8p0.p0(ptr <ptr>, i32 <alignment>, <8 x i1> <mask>)
+
+Overview:
+"""""""""
+
+Reads a vector from memory according to the provided mask, suppressing faults
+for any lane beyond the first. The mask holds a bit for each vector lane, and
+is used to prevent memory accesses to the masked-off lanes. Inactive lanes will
+be zero in the result vector.
+
+Returns the loaded data and an output mask indicating which lanes are valid.
+The output mask may differ from the input mask if the processor encountered a
+reason to avoid loading from the address of a given lane.
+
+Arguments:
+""""""""""
+
+The first argument is the base pointer for the load. The second argument is the
+alignment of the source location. It must be a power of two constant integer
+value. The third argument, mask, is a vector of boolean values with the same
+number of elements as the returned data and output mask vectors.
+
+Semantics:
+""""""""""
+
+The '``llvm.masked.speculative.load``' intrinsic is similar to the
+'``llvm.masked.load``' intrinsic, in that it conditionally loads values from
+memory into a vector based on a mask. However, it allows loading from an
+address range which may not be entirely safe to access. If the memory
+corresponding to the first element of the vector is inaccessible, then a fault
+will be raised as normal. For all subsequent lanes, faults will be suppressed
+and the corresponding bit in the output mask will be marked inactive. All
+remaining elements of the output mask after a suppressed fault will also be
+marked inactive. Elements with active bits in the input mask will be poison
+values if the corresponding bit is inactive in the output mask.
+
+The reasons for marking output elements inactive are processor dependent. The
+cause may be a genuine fault, e.g. the range of data being loaded spans a page
+boundary and the page at the higher address is not mapped, but a given
+processor may also mark elements as inactive for other reasons, such as a cache
+miss. Code using this intrinsic must take this into account and must not assume
+that inactive lanes signal the end of accessible memory. If more data should be
+loaded based on the semantics of the user code, then the base pointer should be
+advanced to the address of the first inactive element and a new speculative
+load attempted.
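+
+As an illustration only (the value names below are not part of the intrinsic's
+definition), a consumer might unpack the result and consult the output mask as
+follows.
+
+::
+
+      %res = call { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %mask)
+      %data = extractvalue { <4 x i32>, <4 x i1> } %res, 0
+      %valid = extractvalue { <4 x i32>, <4 x i1> } %res, 1
+      ;; Lanes inactive in %mask are zero in %data; lanes active in %mask but
+      ;; inactive in %valid may be poison. If %valid deactivates lanes that
+      ;; were requested, advance the base pointer to the first such lane and
+      ;; issue another speculative load for the remaining elements.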
 
 Memory Use Markers
 ------------------
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 4d59ee8676b9e..74fed62fcc1dd 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2505,6 +2505,14 @@ def int_masked_compressstore:
             [IntrWriteMem, IntrArgMemOnly,
              NoCapture<ArgIndex<1>>]>;
 
+def int_masked_speculative_load
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty,
+                             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [llvm_anyptr_ty, llvm_i32_ty,
+                             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>,
+                             NoCapture<ArgIndex<0>>]>;
+
 def int_experimental_vector_compress:
     DefaultAttrsIntrinsic<[llvm_anyvector_ty],
               [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 146e7d1047dd0..531d79f89f7cd 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -1031,6 +1031,68 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
   ModifiedDT = true;
 }
 
+static void scalarizeMaskedSpeculativeLoad(const DataLayout &DL, CallInst *CI,
+                                           DomTreeUpdater *DTU,
+                                           bool &ModifiedDT) {
+  // For a target without speculative/first-faulting load support, we can't
+  // actually scalarize accesses for all lanes. However, lanes beyond the
+  // first may be considered inactive due to reasons beyond a fault, so for
+  // generic 'scalarization' we can just load the first lane (if the
+  // corresponding input mask bit is active), then mark all other lanes as
+  // inactive in the output mask and embed the first lane into a vector of
+  // poison.
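+  //
+  // The emitted IR has roughly the following shape (value names are for
+  // illustration; the block names match those set below):
+  //
+  //     %first.active = extractelement <N x i1> %mask, i64 0
+  //     br i1 %first.active, label %speculative.load.first.lane,
+  //         label %speculative.result
+  //   speculative.load.first.lane:
+  //     %lane0 = load <scalar type>, ptr %ptr
+  //     ; ...insert %lane0 into lane 0 of a poison data vector and set
+  //     ; lane 0 of an all-false mask...
+  //     br label %speculative.result
+  //   speculative.result:
+  //     ; phis merge the all-inactive values with the first-lane-only values,
+  //     ; which are then packed into the { data, mask } result struct that
+  //     ; replaces the intrinsic call.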
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Align = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  StructType *RetTy = cast<StructType>(CI->getType());
+  VectorType *DataTy = cast<VectorType>(RetTy->getElementType(0));
+  VectorType *MaskTy = cast<VectorType>(RetTy->getElementType(1));
+  Type *ScalarTy = DataTy->getScalarType();
+
+  MaybeAlign AlignVal = cast<ConstantInt>(Align)->getMaybeAlignValue();
+
+  IRBuilder<> Builder(CI->getContext());
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(CI);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  Value *EmptyMask = Constant::getNullValue(MaskTy);
+  Value *PoisonData = PoisonValue::get(DataTy);
+
+  // FIXME: If the mask is a constant, we can skip the extract.
+  Value *FirstActive =
+      Builder.CreateExtractElement(Mask, 0ul, Twine("first.active"));
+  Instruction *ThenTerm =
+      SplitBlockAndInsertIfThen(FirstActive, CI,
+                                /*Unreachable=*/false,
+                                /*BranchWeights=*/nullptr, DTU);
+
+  BasicBlock *ThenBlock = ThenTerm->getParent();
+  ThenBlock->setName("speculative.load.first.lane");
+  Builder.SetInsertPoint(ThenBlock->getTerminator());
+  LoadInst *Load = Builder.CreateAlignedLoad(ScalarTy, Ptr, AlignVal);
+  Value *OneLaneData = Builder.CreateInsertElement(PoisonData, Load, 0ul);
+  Value *OneLaneMask = Builder.CreateInsertElement(
+      EmptyMask, Constant::getAllOnesValue(MaskTy->getElementType()), 0ul);
+
+  Builder.SetInsertPoint(CI);
+  PHINode *ResData = Builder.CreatePHI(DataTy, 2);
+  ResData->addIncoming(PoisonData, IfBlock);
+  ResData->addIncoming(OneLaneData, ThenBlock);
+  PHINode *ResMask = Builder.CreatePHI(MaskTy, 2);
+  ResMask->addIncoming(EmptyMask, IfBlock);
+  ResMask->addIncoming(OneLaneMask, ThenBlock);
+
+  Value *Result = PoisonValue::get(RetTy);
+  Result = Builder.CreateInsertValue(Result, ResData, 0ul);
+  Result = Builder.CreateInsertValue(Result, ResMask, 1ul);
+  if (CI->hasName())
+    Result->setName(CI->getName() + ".first.lane.only");
+  CI->getParent()->setName("speculative.result");
+  CI->replaceAllUsesWith(Result);
+  CI->eraseFromParent();
+  ModifiedDT = true;
+}
+
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
                     DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
@@ -1170,8 +1232,12 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
       scalarizeMaskedCompressStore(DL, HasBranchDivergence, CI, DTU,
                                    ModifiedDT);
       return true;
+    case Intrinsic::masked_speculative_load:
+      scalarizeMaskedSpeculativeLoad(DL, CI, DTU, ModifiedDT);
+      return true;
     }
   }
 
   return false;
 }
diff --git a/llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll b/llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll
new file mode 100644
index 0000000000000..e8facb8a5683b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=NEON
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=SVE
+
+define { <4 x i32>, <4 x i1> } @speculative_load_v4i32(ptr %p, <4 x i1> %mask) {
+; NEON-LABEL: speculative_load_v4i32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NEON-NEXT:    umov w8, v0.h[0]
+; NEON-NEXT:    tbz w8, #0, .LBB0_2
+; NEON-NEXT:  // %bb.1: // %speculative.load.first.lane
+; NEON-NEXT:    adrp x8, .LCPI0_0
+; NEON-NEXT:    ldr s0, [x0]
+; NEON-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+; NEON-NEXT:  .LBB0_2:
+; NEON-NEXT:    movi v1.2d, #0000000000000000
+; NEON-NEXT:    // implicit-def: $q0
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: speculative_load_v4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SVE-NEXT:    umov w8, v0.h[0]
+; SVE-NEXT:    tbz w8, #0, .LBB0_2
+; SVE-NEXT:  // %bb.1: // %speculative.load.first.lane
+; SVE-NEXT:    adrp x8, .LCPI0_0
+; SVE-NEXT:    ldr s0, [x0]
+; SVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+; SVE-NEXT:  .LBB0_2:
+; SVE-NEXT:    movi v1.2d, #0000000000000000
+; SVE-NEXT:    // implicit-def: $q0
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr %p, i32 16, <4 x i1> %mask)
+  ret { <4 x i32>, <4 x i1> } %res
+}
+
+;; FIXME: If we know the input mask is all-true and the vector is fully aligned,
+;;        we should be able to use a normal NEON load here.
+define { <2 x double>, <2 x i1> } @speculative_load_v2f64_all_true_fully_aligned(ptr %p) {
+; NEON-LABEL: speculative_load_v2f64_all_true_fully_aligned:
+; NEON:       // %bb.0: // %speculative.load.first.lane
+; NEON-NEXT:    adrp x8, .LCPI1_0
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: speculative_load_v2f64_all_true_fully_aligned:
+; SVE:       // %bb.0: // %speculative.load.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.speculative.load.v2f64.p0(ptr %p, i32 16, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @speculative_load_v2f64_all_true_partially_aligned(ptr %p) {
+; NEON-LABEL: speculative_load_v2f64_all_true_partially_aligned:
+; NEON:       // %bb.0: // %speculative.load.first.lane
+; NEON-NEXT:    adrp x8, .LCPI2_0
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    ldr d1, [x8, :lo12:.LCPI2_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: speculative_load_v2f64_all_true_partially_aligned:
+; SVE:       // %bb.0: // %speculative.load.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.speculative.load.v2f64.p0(ptr %p, i32 8, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}


