[llvm] [IR][AArch64] Add llvm.masked.speculative.load intrinsic (PR #156470)

via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 30 05:36:44 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-aarch64

Author: Graham Hunter (huntergr-arm)

<details>
<summary>Changes</summary>

In order to support loading from addresses which may not be valid at
runtime, without generating faults for the invalid ones, we introduce
the speculative load intrinsic. A load performed with this intrinsic
will only generate a fault for an invalid access to the first element
of the vector. Any subsequent fault will be suppressed and the
corresponding data will be poison.

This PR contains both target-independent and AArch64-specific codegen
for the intrinsic.
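
For illustration, a call to the intrinsic and one way to consume its
output mask might look like this in IR (a minimal sketch; the function
name, the chosen alignment and the `select` cleanup are illustrative,
not taken from the patch):

```llvm
define <4 x i32> @use_speculative_load(ptr %p, <4 x i1> %mask) {
  ; Load up to four i32s; only the access for lane 0 can actually fault.
  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %mask)
  %data = extractvalue { <4 x i32>, <4 x i1> } %res, 0
  %valid = extractvalue { <4 x i32>, <4 x i1> } %res, 1
  ; Lanes that were requested but not actually loaded are poison in %data,
  ; so replace every non-valid lane with a known value before further use.
  %safe = select <4 x i1> %valid, <4 x i32> %data, <4 x i32> zeroinitializer
  ret <4 x i32> %safe
}

declare { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr, i32, <4 x i1>)
```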


---
Full diff: https://github.com/llvm/llvm-project/pull/156470.diff


4 Files Affected:

- (modified) llvm/docs/LangRef.rst (+59) 
- (modified) llvm/include/llvm/IR/Intrinsics.td (+8) 
- (modified) llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp (+66) 
- (added) llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll (+79) 


``````````diff
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1c6823be44dcb..92ea6554c4aba 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -27098,6 +27098,65 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
 
 Other targets may support this intrinsic differently, for example, by lowering it into a sequence of branches that guard scalar store operations.
 
+.. _int_mspecload:
+
+'``llvm.masked.speculative.load.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The loaded data is a vector of any integer, floating-point or pointer data type.
+
+::
+
+      declare { <16 x float>, <16 x i1> } @llvm.masked.speculative.load.v16f32.p0(ptr <ptr>, i32 <alignment>, <16 x i1> <mask>)
+      declare { <2 x double>, <2 x i1> } @llvm.masked.speculative.load.v2f64.p0(ptr <ptr>, i32 <alignment>, <2 x i1> <mask>)
+      ;; The data is a vector of pointers
+      declare { <8 x ptr>, <8 x i1> } @llvm.masked.speculative.load.v8p0.p0(ptr <ptr>, i32 <alignment>, <8 x i1> <mask>)
+
+Overview:
+"""""""""
+
+Reads a vector from memory according to the provided mask, suppressing faults
+for any lane beyond the first. The mask holds a bit for each vector lane, and
+is used to prevent memory accesses to the masked-off lanes. Inactive lanes will
+be zero in the result vector.
+
+Returns the loaded data and a mask indicating which lanes are valid. The output
+mask may not be the same as the input mask, depending on whether the processor
+encountered a reason to avoid loading from a given lane's address.
+
+Arguments:
+""""""""""
+
+The first argument is the base pointer for the load. The second argument is the
+alignment of the source location. It must be a power of two constant integer
+value. The third argument, mask, is a vector of boolean values with the same
+number of elements as the return type.
+
+Semantics:
+""""""""""
+
+The '``llvm.masked.speculative.load``' intrinsic is similar to the
+'``llvm.masked.load``' intrinsic, in that it conditionally loads values from
+memory into a vector based on a mask. However, it allows loading from addresses
+which may not be entirely safe. If the memory corresponding to the first element
+of the vector is inaccessible, then a fault will be raised as normal. For all
+subsequent lanes, faults will be suppressed and the corresponding bits in the
+output mask will be marked inactive. The remaining elements in the output mask
+after a suppressed fault will also be marked inactive. Elements with active bits
+in the input mask will be poison values if the corresponding bit is inactive in
+the output mask.
+
+The reason for marking an output element inactive is processor dependent; it
+may be a genuine fault, e.g. if the range of the data being loaded spans a page
+boundary and the page at the higher address is not mapped. But a given
+processor may also mark elements as inactive for other reasons, such as a cache
+miss. Code using this intrinsic must take this into account and not assume that
+inactive lanes signal the end of accessible memory. If more data should be
+loaded based on the semantics of the user code, then the base pointer should be
+advanced to the address of the first inactive element and a new speculative load
+attempted.
 
 Memory Use Markers
 ------------------
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 4d59ee8676b9e..74fed62fcc1dd 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2505,6 +2505,14 @@ def int_masked_compressstore:
             [IntrWriteMem, IntrArgMemOnly,
              NoCapture<ArgIndex<1>>]>;
 
+def int_masked_speculative_load
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty,
+                             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [llvm_anyptr_ty, llvm_i32_ty,
+                             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>,
+                             NoCapture<ArgIndex<0>>]>;
+
 def int_experimental_vector_compress:
     DefaultAttrsIntrinsic<[llvm_anyvector_ty],
               [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 146e7d1047dd0..531d79f89f7cd 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -1031,6 +1031,68 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
   ModifiedDT = true;
 }
 
+static void scalarizeMaskedSpeculativeLoad(const DataLayout &DL, CallInst *CI,
+                                           DomTreeUpdater *DTU,
+                                           bool &ModifiedDT) {
+  // For a target without speculative/first-faulting load support, we can't
+  // actually scalarize accesses for all lanes. However, lanes beyond the
+  // first may be considered inactive due to reasons beyond a fault, so for
+  // generic 'scalarization' we can just load the first lane (if the
+  // corresponding input mask bit is active), then mark all other lanes as
+  // inactive in the output mask and embed the first lane into a vector of
+  // poison.
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Align = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  StructType *RetTy = cast<StructType>(CI->getType());
+  VectorType *DataTy = cast<VectorType>(RetTy->getElementType(0));
+  VectorType *MaskTy = cast<VectorType>(RetTy->getElementType(1));
+  Type *ScalarTy = DataTy->getScalarType();
+
+  MaybeAlign AlignVal = cast<ConstantInt>(Align)->getMaybeAlignValue();
+
+  IRBuilder<> Builder(CI->getContext());
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(CI);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  Value *EmptyMask = Constant::getNullValue(MaskTy);
+  Value *PoisonData = PoisonValue::get(DataTy);
+
+  // FIXME: If the mask is a constant, we can skip the extract.
+  Value *FirstActive =
+      Builder.CreateExtractElement(Mask, 0ul, Twine("first.active"));
+  Instruction *ThenTerm =
+      SplitBlockAndInsertIfThen(FirstActive, CI,
+                                /*Unreachable=*/false,
+                                /*BranchWeights=*/nullptr, DTU);
+
+  BasicBlock *ThenBlock = ThenTerm->getParent();
+  ThenBlock->setName("speculative.load.first.lane");
+  Builder.SetInsertPoint(ThenBlock->getTerminator());
+  LoadInst *Load = Builder.CreateAlignedLoad(ScalarTy, Ptr, AlignVal);
+  Value *OneLaneData = Builder.CreateInsertElement(PoisonData, Load, 0ul);
+  Value *OneLaneMask = Builder.CreateInsertElement(
+      EmptyMask, Constant::getAllOnesValue(MaskTy->getElementType()), 0ul);
+
+  Builder.SetInsertPoint(CI);
+  PHINode *ResData = Builder.CreatePHI(DataTy, 2);
+  ResData->addIncoming(PoisonData, IfBlock);
+  ResData->addIncoming(OneLaneData, ThenBlock);
+  PHINode *ResMask = Builder.CreatePHI(MaskTy, 2);
+  ResMask->addIncoming(EmptyMask, IfBlock);
+  ResMask->addIncoming(OneLaneMask, ThenBlock);
+
+  Value *Result = PoisonValue::get(RetTy);
+  Result = Builder.CreateInsertValue(Result, ResData, 0ul);
+  Result = Builder.CreateInsertValue(Result, ResMask, 1ul);
+  if (CI->hasName())
+    Result->setName(CI->getName() + ".first.lane.only");
+  CI->getParent()->setName("speculative.result");
+  CI->replaceAllUsesWith(Result);
+  CI->eraseFromParent();
+  ModifiedDT = true;
+}
+
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
                     DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
@@ -1170,8 +1232,12 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
       scalarizeMaskedCompressStore(DL, HasBranchDivergence, CI, DTU,
                                    ModifiedDT);
       return true;
+    case Intrinsic::masked_speculative_load: {
+      scalarizeMaskedSpeculativeLoad(DL, CI, DTU, ModifiedDT);
+      return true;
     }
   }
+  }
 
   return false;
 }
diff --git a/llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll b/llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll
new file mode 100644
index 0000000000000..e8facb8a5683b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/masked-speculative-load-fixed-vectors.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=NEON
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=SVE
+
+define { <4 x i32>, <4 x i1> } @speculative_load_v4i32(ptr %p, <4 x i1> %mask) {
+; NEON-LABEL: speculative_load_v4i32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NEON-NEXT:    umov w8, v0.h[0]
+; NEON-NEXT:    tbz w8, #0, .LBB0_2
+; NEON-NEXT:  // %bb.1: // %speculative.load.first.lane
+; NEON-NEXT:    adrp x8, .LCPI0_0
+; NEON-NEXT:    ldr s0, [x0]
+; NEON-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+; NEON-NEXT:  .LBB0_2:
+; NEON-NEXT:    movi v1.2d, #0000000000000000
+; NEON-NEXT:    // implicit-def: $q0
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: speculative_load_v4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SVE-NEXT:    umov w8, v0.h[0]
+; SVE-NEXT:    tbz w8, #0, .LBB0_2
+; SVE-NEXT:  // %bb.1: // %speculative.load.first.lane
+; SVE-NEXT:    adrp x8, .LCPI0_0
+; SVE-NEXT:    ldr s0, [x0]
+; SVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+; SVE-NEXT:  .LBB0_2:
+; SVE-NEXT:    movi v1.2d, #0000000000000000
+; SVE-NEXT:    // implicit-def: $q0
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr %p, i32 16, <4 x i1> %mask)
+  ret { <4 x i32>, <4 x i1> } %res
+}
+
+;; FIXME: If we know the input mask is all-true and the vector is fully aligned,
+;;        we should be able to use a normal NEON load here.
+define { <2 x double>, <2 x i1> } @speculative_load_v2f64_all_true_fully_aligned(ptr %p) {
+; NEON-LABEL: speculative_load_v2f64_all_true_fully_aligned:
+; NEON:       // %bb.0: // %speculative.load.first.lane
+; NEON-NEXT:    adrp x8, .LCPI1_0
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: speculative_load_v2f64_all_true_fully_aligned:
+; SVE:       // %bb.0: // %speculative.load.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.speculative.load.v2f64.p0(ptr %p, i32 16, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @speculative_load_v2f64_all_true_partially_aligned(ptr %p) {
+; NEON-LABEL: speculative_load_v2f64_all_true_partially_aligned:
+; NEON:       // %bb.0: // %speculative.load.first.lane
+; NEON-NEXT:    adrp x8, .LCPI2_0
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    ldr d1, [x8, :lo12:.LCPI2_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: speculative_load_v2f64_all_true_partially_aligned:
+; SVE:       // %bb.0: // %speculative.load.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.speculative.load.v2f64.p0(ptr %p, i32 8, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}

``````````
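
For illustration of the "advance and retry" pattern described in the new
LangRef semantics, a loop using the intrinsic might look roughly like the
sketch below. This is hand-written and not part of the patch; the loop
structure, the all-true input mask, the elided use of the loaded data, and
all names are assumptions for the example, and a real user would also need
to handle the tail where fewer than four elements remain.

```llvm
define void @copy_chunks(ptr %src, ptr %dst, i64 %n) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %latch ]
  %done = icmp uge i64 %i, %n
  br i1 %done, label %exit, label %body

body:
  %addr = getelementptr i32, ptr %src, i64 %i
  ; Request all four lanes; lanes after the first may be reported invalid.
  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %data = extractvalue { <4 x i32>, <4 x i1> } %res, 0
  %valid = extractvalue { <4 x i32>, <4 x i1> } %res, 1
  ; With an all-true input mask the valid lanes form a leading prefix, so a
  ; popcount of the output mask gives the number of elements actually loaded.
  %valid.ext = zext <4 x i1> %valid to <4 x i64>
  %nvalid = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %valid.ext)
  ; ... use the first %nvalid lanes of %data here, e.g. store them to %dst ...
  br label %latch

latch:
  ; Advance only past the lanes that were loaded and retry from the first
  ; invalid lane's address; if that memory really is inaccessible, the next
  ; iteration's load faults on its first lane. Assuming the first requested
  ; lane is always either loaded or faults (as with SVE first-faulting
  ; loads), %nvalid is at least 1 and the loop makes forward progress.
  %i.next = add i64 %i, %nvalid
  br label %loop

exit:
  ret void
}

declare { <4 x i32>, <4 x i1> } @llvm.masked.speculative.load.v4i32.p0(ptr, i32, <4 x i1>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
```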

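The generic scalarization added in ScalarizeMaskedMemIntrin only ever loads
the first lane and reports every other lane as invalid, which is a legal
result for the intrinsic. For a `<4 x i32>` call like the one in the test,
the IR it produces has roughly the following shape (a hand-written sketch
with illustrative value names, not output captured from the pass):

```llvm
define { <4 x i32>, <4 x i1> } @scalarized_form(ptr %p, <4 x i1> %mask) {
entry:
  %first.active = extractelement <4 x i1> %mask, i64 0
  br i1 %first.active, label %speculative.load.first.lane, label %speculative.result

speculative.load.first.lane:
  ; Alignment is taken from the call's alignment operand (16 in the test).
  %elt = load i32, ptr %p, align 16
  %one.lane.data = insertelement <4 x i32> poison, i32 %elt, i64 0
  %one.lane.mask = insertelement <4 x i1> zeroinitializer, i1 true, i64 0
  br label %speculative.result

speculative.result:
  %data = phi <4 x i32> [ poison, %entry ], [ %one.lane.data, %speculative.load.first.lane ]
  %out.mask = phi <4 x i1> [ zeroinitializer, %entry ], [ %one.lane.mask, %speculative.load.first.lane ]
  %res.0 = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> %data, 0
  %res = insertvalue { <4 x i32>, <4 x i1> } %res.0, <4 x i1> %out.mask, 1
  ret { <4 x i32>, <4 x i1> } %res
}
```
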
</details>


https://github.com/llvm/llvm-project/pull/156470

