[llvm] 89f53af - [ConstraintElim] Use constraints from bounded memory accesses (#155253)

Tue Sep 2 06:41:06 PDT 2025

Author: Yingwei Zheng
Date: 2025-09-02T21:41:02+08:00
New Revision: 89f53af3fffed3e41167fbb7bc10d4885cd97c7f

URL: https://github.com/llvm/llvm-project/commit/89f53af3fffed3e41167fbb7bc10d4885cd97c7f
DIFF: https://github.com/llvm/llvm-project/commit/89f53af3fffed3e41167fbb7bc10d4885cd97c7f.diff

LOG: [ConstraintElim] Use constraints from bounded memory accesses (#155253)

This patch removes bounds checks that are dominated by bounded memory
accesses. For example, if we have an array `int A[5]` and the access `A[idx]`
executes successfully, we know that `idx u< 5` holds after the load.

compile-time impact (+0.1%):
https://llvm-compile-time-tracker.com/compare.php?from=f0e9bba024d44b55d54b02025623ce4a3ba5a37c&to=5227b08a4a514159ec524d1b1ca18ed8ab5407df&stat=instructions%3Au
llvm-opt-benchmark:
https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2709

Proof: https://alive2.llvm.org/ce/z/JEyjA2

Added: 
    llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll

Modified: 
    llvm/lib/Transforms/Scalar/ConstraintElimination.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 1ddb8ae9518fc..1b4d8c786cbb2 100644

--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -19,9 +19,11 @@
 #include "llvm/Analysis/ConstraintSystem.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -170,10 +172,12 @@ struct State {
   DominatorTree &DT;
   LoopInfo &LI;
   ScalarEvolution &SE;
+  TargetLibraryInfo &TLI;
   SmallVector<FactOrCheck, 64> WorkList;
 
-  State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE)
-      : DT(DT), LI(LI), SE(SE) {}
+  State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE,
+        TargetLibraryInfo &TLI)
+      : DT(DT), LI(LI), SE(SE), TLI(TLI) {}
 
   /// Process block \p BB and add known facts to work-list.
   void addInfoFor(BasicBlock &BB);
@@ -1109,10 +1113,50 @@ void State::addInfoForInductions(BasicBlock &BB) {
   }
 }
 
+/// Try to derive a bound on the single variable index of \p GEP from the fact
+/// that a memory access of \p AccessSize bytes is performed through it.
+///
+/// On success, returns true and sets \p Pred, \p A and \p B such that
+/// `A Pred B` (always `Index u<= MaxIndex` here) follows from the access.
+/// Returns false without touching the outputs if the collected GEP offsets
+/// are not known to be free of unsigned wrap, if there is not exactly one
+/// variable index, if the index is not as wide as the offset arithmetic, or if
+/// the size of the base object cannot be determined as a fixed value.
+static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP,
+                                          uint64_t AccessSize,
+                                          CmpPredicate &Pred, Value *&A,
+                                          Value *&B, const DataLayout &DL,
+                                          const TargetLibraryInfo &TLI) {
+  auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL);
+  if (!Offset.NW.hasNoUnsignedWrap())
+    return false;
+
+  if (Offset.VariableOffsets.size() != 1)
+    return false;
+
+  uint64_t BitWidth = Offset.ConstantOffset.getBitWidth();
+  auto &[Index, Scale] = Offset.VariableOffsets.front();
+  // Bail out on non-canonical GEPs whose index type is not as wide as the
+  // offset arithmetic. MaxIndex below is BitWidth bits wide, and
+  // ConstantInt::get asserts when the APInt width differs from the type.
+  if (Index->getType()->getScalarSizeInBits() != BitWidth)
+    return false;
+
+  ObjectSizeOpts Opts;
+  // Workaround for gep inbounds, ptr null, idx.
+  Opts.NullIsUnknownSize = true;
+  // Be conservative since we are not clear on whether an out of bounds access
+  // to the padding is UB or not.
+  Opts.RoundToAlign = true;
+  std::optional<TypeSize> Size =
+      getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts);
+  if (!Size || Size->isScalable())
+    return false;
+
+  // Index * Scale + ConstOffset + AccessSize <= AllocSize
+  // With nuw flag, we know that the index addition doesn't have unsigned wrap.
+  // If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is no valid
+  // value for Index.
+  APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize,
+                          /*isSigned=*/false, /*implicitTrunc=*/true) -
+                    Offset.ConstantOffset)
+                       .udiv(Scale);
+  Pred = ICmpInst::ICMP_ULE;
+  A = Index;
+  B = ConstantInt::get(Index->getType(), MaxIndex);
+  return true;
+}
+
 void State::addInfoFor(BasicBlock &BB) {
   addInfoForInductions(BB);
+  auto &DL = BB.getDataLayout();
 
-  // True as long as long as the current instruction is guaranteed to execute.
+  // True as long as the current instruction is guaranteed to execute.
   bool GuaranteedToExecute = true;
   // Queue conditions and assumes.
   for (Instruction &I : BB) {
@@ -1127,6 +1171,38 @@ void State::addInfoFor(BasicBlock &BB) {
       continue;
     }
 
+    auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+      if (!GEP)
+        return;
+      TypeSize AccessSize = DL.getTypeStoreSize(AccessType);
+      if (!AccessSize.isFixed())
+        return;
+      if (GuaranteedToExecute) {
+        CmpPredicate Pred;
+        Value *A, *B;
+        if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(),
+                                          Pred, A, B, DL, TLI)) {
+          // The memory access is guaranteed to execute when BB is entered,
+          // hence the constraint holds on entry to BB.
+          WorkList.emplace_back(FactOrCheck::getConditionFact(
+              DT.getNode(I.getParent()), Pred, A, B));
+        }
+      } else {
+        WorkList.emplace_back(
+            FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I));
+      }
+    };
+
+    if (auto *LI = dyn_cast<LoadInst>(&I)) {
+      if (!LI->isVolatile())
+        AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType());
+    }
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      if (!SI->isVolatile())
+        AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType());
+    }
+
     auto *II = dyn_cast<IntrinsicInst>(&I);
     Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
     switch (ID) {
@@ -1420,7 +1496,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A,
   LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n");
 
   auto R = Info.getConstraintForSolving(Pred, A, B);
-  if (R.empty() || !R.isValid(Info)){
+  if (R.empty() || !R.isValid(Info)) {
     LLVM_DEBUG(dbgs() << "   failed to decompose condition\n");
     return std::nullopt;
   }
@@ -1785,12 +1861,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info,
 
 static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
                                  ScalarEvolution &SE,
-                                 OptimizationRemarkEmitter &ORE) {
+                                 OptimizationRemarkEmitter &ORE,
+                                 TargetLibraryInfo &TLI) {
   bool Changed = false;
   DT.updateDFSNumbers();
   SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args()));
   ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
-  State S(DT, LI, SE);
+  State S(DT, LI, SE, TLI);
   std::unique_ptr<Module> ReproducerModule(
       DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr);
 
@@ -1960,6 +2037,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
         }
         continue;
       }
+
+      auto &DL = F.getDataLayout();
+      auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) {
+        CmpPredicate Pred;
+        Value *A, *B;
+        if (getConstraintFromMemoryAccess(
+                *cast<GetElementPtrInst>(Ptr),
+                DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL,
+                TLI))
+          AddFact(Pred, A, B);
+      };
+
+      if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) {
+        AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType());
+        continue;
+      }
+      if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) {
+        AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType());
+        continue;
+      }
     }
 
     Value *A = nullptr, *B = nullptr;
@@ -2018,7 +2115,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F,
   auto &LI = AM.getResult<LoopAnalysis>(F);
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-  if (!eliminateConstraints(F, DT, LI, SE, ORE))
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;

diff  --git a/llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll b/llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll
new file mode 100644
index 0000000000000..8e3862b5714d0
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll
@@ -0,0 +1,373 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s
+
+@g = private unnamed_addr constant [5 x i8] c"test\00"
+@g_overaligned = private unnamed_addr constant [5 x i8] c"test\00", align 8
+@g_external = external global [5 x i8]
+
+declare void @free(ptr allocptr noundef captures(none)) mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc"
+declare ptr @malloc(i64) mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0) memory(inaccessiblemem: readwrite) "alloc-family"="malloc"
+declare void @may_not_return(i1)
+
+; Positive test: a non-volatile i8 load through a nuw GEP into the 5-byte
+; global @g precedes the compare, so `%idx u< 5` is folded to true.
+define i8 @load_global(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Positive test: the load goes through a constant offset of 1 plus %idx into
+; the 5-byte global @g, so the tighter bound `%idx u< 4` is folded to true.
+define i8 @load_global_const_offset(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_const_offset(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr nuw i8, ptr @g, i64 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr [[GEP1]], i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep1 = getelementptr nuw i8, ptr @g, i64 1
+  %gep = getelementptr nuw i8, ptr %gep1, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 4
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Positive test: an unordered atomic (non-volatile) load from @g at %idx also
+; lets the following `%idx u< 5` fold to true.
+define i8 @load_global_atomic(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_atomic(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load atomic i8, ptr [[GEP]] unordered, align 1
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  %load = load atomic i8, ptr %gep unordered, align 1
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Positive test: a non-volatile i8 store through a nuw GEP into @g precedes the
+; compare, so `%idx u< 5` is folded to true and returned directly.
+define i1 @store_global(i64 %idx) {
+; CHECK-LABEL: define i1 @store_global(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    store i8 0, ptr [[GEP]], align 1
+; CHECK-NEXT:    ret i1 true
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  store i8 0, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  ret i1 %cmp
+}
+
+; Positive test: an atomic release (non-volatile) store into @g at %idx also
+; lets the following `%idx u< 5` fold to true.
+define i1 @store_global_atomic(i64 %idx) {
+; CHECK-LABEL: define i1 @store_global_atomic(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    store atomic i8 0, ptr [[GEP]] release, align 1
+; CHECK-NEXT:    ret i1 true
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  store atomic i8 0, ptr %gep release, align 1
+  %cmp = icmp ult i64 %idx, 5
+  ret i1 %cmp
+}
+
+; Positive test: the base object is a byval([5 x i8]) argument; after the i8
+; load at %idx, `%idx u< 5` is folded to true.
+define i8 @load_byval(ptr byval([5 x i8]) %p, i64 %idx) {
+; CHECK-LABEL: define i8 @load_byval(
+; CHECK-SAME: ptr byval([5 x i8]) [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr %p, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Positive test: the base object is a [5 x i8] alloca (initialized by memcpy);
+; after the i8 load at %idx, `%idx u< 5` is folded to true.
+define i8 @load_alloca(i64 %idx) {
+; CHECK-LABEL: define i8 @load_alloca(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[ALLOC:%.*]] = alloca [5 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[ALLOC]], ptr @g, i64 5, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr [[ALLOC]], i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %alloc = alloca [5 x i8], align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr %alloc, ptr @g, i64 5, i1 false)
+  %gep = getelementptr nuw i8, ptr %alloc, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Positive test: the base object is the result of `malloc(5)`; after the i8
+; load at %idx, `%idx u< 5` is folded to true.
+define i8 @load_malloc(i64 %idx) {
+; CHECK-LABEL: define i8 @load_malloc(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[ALLOC:%.*]] = call ptr @malloc(i64 5)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[ALLOC]], ptr @g, i64 5, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr [[ALLOC]], i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    call void @free(ptr [[ALLOC]])
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %alloc = call ptr @malloc(i64 5)
+  call void @llvm.memcpy.p0.p0.i64(ptr %alloc, ptr @g, i64 5, i1 false)
+  %gep = getelementptr nuw i8, ptr %alloc, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  call void @free(ptr %alloc)
+  ret i8 %add
+}
+
+; Positive test: a 4-byte i32 load from a 10-byte byval object at byte offset
+; %idx; the access size is taken into account and `%idx u< 7` folds to true.
+define i32 @load_byval_i32(ptr byval([10 x i8]) %p, i64 %idx) {
+; CHECK-LABEL: define i32 @load_byval_i32(
+; CHECK-SAME: ptr byval([10 x i8]) [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr %p, i64 %idx
+  %load = load i32, ptr %gep
+  %cmp = icmp ult i64 %idx, 7
+  %zext = zext i1 %cmp to i32
+  %add = add i32 %load, %zext
+  ret i32 %add
+}
+
+; The load sits after a call that may not return, so the fact only applies
+; from the load onwards: %cmp1 (before the call) is kept, while %cmp2 in the
+; successor block, which executes only after the load, is folded to true.
+define i8 @load_global_may_noreturn_dom_bb(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_may_noreturn_dom_bb(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    call void @may_not_return(i1 [[CMP1]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    br label %[[NEXT:.*]]
+; CHECK:       [[NEXT]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  %cmp1 = icmp ult i64 %idx, 5
+  call void @may_not_return(i1 %cmp1) ; %cmp1 should not be simplified.
+  %load = load i8, ptr %gep
+  br label %next
+
+next:
+  %cmp2 = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp2 to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Negative tests.
+
+; Negative test: @g_overaligned is a 5-byte global with `align 8`; the compare
+; `%idx u< 5` is not folded (presumably because the pass rounds the object
+; size up to the alignment, so accesses into the padding are not excluded).
+define i8 @load_global_overaligned(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_overaligned(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g_overaligned, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr @g_overaligned, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Negative test: @g_external is only declared (external global), and the
+; compare `%idx u< 5` is not folded (presumably because the declared size of
+; an external definition is not relied upon -- confirm against the analysis).
+define i8 @load_global_external(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_external(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g_external, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr @g_external, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Negative test: the load's pointer operand is a plain argument, not a GEP, so
+; nothing is learned about %idx and the compare is kept.
+define i8 @load_from_non_gep(ptr %p, i64 %idx) {
+; CHECK-LABEL: define i8 @load_from_non_gep(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %load = load i8, ptr %p
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Negative test: the address is formed from two variable indices (%idx1 and
+; %idx2); no bound is derived for %idx1 and the compare is kept.
+define i8 @load_global_multi_indices(i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: define i8 @load_global_multi_indices(
+; CHECK-SAME: i64 [[IDX1:%.*]], i64 [[IDX2:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr nuw i8, ptr [[GEP1]], i64 [[IDX2]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP2]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX1]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep1 = getelementptr nuw i8, ptr @g, i64 %idx1
+  %gep2 = getelementptr nuw i8, ptr %gep1, i64 %idx2
+  %load = load i8, ptr %gep2
+  %cmp = icmp ult i64 %idx1, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Negative test: the GEP has no `nuw` flag, so the offset computation may wrap
+; and the compare `%idx u< 5` is kept.
+define i8 @load_global_without_nuw(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_without_nuw(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr i8, ptr @g, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Negative test: a 4-byte load from a 10-byte byval object does not rule out
+; %idx == 6, so the stricter compare `%idx u< 6` is kept.
+define i32 @load_byval_i32_smaller_range(ptr byval([10 x i8]) %p, i64 %idx) {
+; CHECK-LABEL: define i32 @load_byval_i32_smaller_range(
+; CHECK-SAME: ptr byval([10 x i8]) [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 6
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr %p, i64 %idx
+  %load = load i32, ptr %gep
+  %cmp = icmp ult i64 %idx, 6
+  %zext = zext i1 %cmp to i32
+  %add = add i32 %load, %zext
+  ret i32 %add
+}
+
+; Negative test: the load is volatile, so no fact is derived from it and the
+; compare `%idx u< 5` is kept.
+define i8 @load_global_volatile(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_volatile(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  %load = load volatile i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}
+
+; Negative test: the store is volatile, so no fact is derived from it and the
+; compare `%idx u< 5` is kept.
+define i1 @store_global_volatile(i64 %idx) {
+; CHECK-LABEL: define i1 @store_global_volatile(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    store volatile i8 0, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  store volatile i8 0, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  ret i1 %cmp
+}
+
+; Negative test: the loaded type is a scalable vector, so the access size is
+; not a fixed value and the compare `%idx u< 5` is kept.
+define i8 @load_global_vscale(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_vscale(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 1 x i8>, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <vscale x 1 x i8> [[LOAD]], i64 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[EXT]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr @g, i64 %idx
+  %load = load <vscale x 1 x i8>, ptr %gep
+  %ext = extractelement <vscale x 1 x i8> %load, i64 0
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %ext, %zext
+  ret i8 %add
+}
+
+; Negative test: the GEP base is `null`, which has no known object size, so
+; no bound on %idx is derived and the compare is kept.
+define i8 @load_from_null(i64 %idx) {
+; CHECK-LABEL: define i8 @load_from_null(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nuw i8, ptr null, i64 [[IDX]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %gep = getelementptr nuw i8, ptr null, i64 %idx
+  %load = load i8, ptr %gep
+  %cmp = icmp ult i64 %idx, 5
+  %zext = zext i1 %cmp to i8
+  %add = add i8 %load, %zext
+  ret i8 %add
+}