[llvm] 369be31 - [X86, SimplifyCFG] Support conditional faulting load or store only (#132032)

via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 21 06:19:50 PDT 2025


Author: Phoebe Wang
Date: 2025-03-21T21:19:46+08:00
New Revision: 369be311a7b8344699d12719a8fa557fe8934e46

URL: https://github.com/llvm/llvm-project/commit/369be311a7b8344699d12719a8fa557fe8934e46
DIFF: https://github.com/llvm/llvm-project/commit/369be311a7b8344699d12719a8fa557fe8934e46.diff

LOG: [X86,SimplifyCFG] Support conditional faulting load or store only (#132032)

This is to fix a bug when a target only supports conditional faulting
loads; see the test case hoist_store_without_cstore.

Split `-simplifycfg-hoist-loads-stores-with-cond-faulting` into
`-simplifycfg-hoist-loads-with-cond-faulting` and
`-simplifycfg-hoist-stores-with-cond-faulting` to control conditional
faulting load and store respectively.

Added: 
    

Modified: 
    llvm/lib/Transforms/Utils/SimplifyCFG.cpp
    llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 2de966e00542d..02f1d08759129 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -120,11 +120,13 @@ static cl::opt<bool>
     HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true),
                 cl::desc("Hoist common instructions up to the parent block"));
 
-static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
-    "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
-    cl::init(true),
-    cl::desc("Hoist loads/stores if the target supports "
-             "conditional faulting"));
+static cl::opt<bool> HoistLoadsWithCondFaulting(
+    "simplifycfg-hoist-loads-with-cond-faulting", cl::Hidden, cl::init(true),
+    cl::desc("Hoist loads if the target supports conditional faulting"));
+
+static cl::opt<bool> HoistStoresWithCondFaulting(
+    "simplifycfg-hoist-stores-with-cond-faulting", cl::Hidden, cl::init(true),
+    cl::desc("Hoist stores if the target supports conditional faulting"));
 
 static cl::opt<unsigned> HoistLoadsStoresWithCondFaultingThreshold(
     "hoist-loads-stores-with-cond-faulting-threshold", cl::Hidden, cl::init(6),
@@ -1682,22 +1684,22 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
 static void hoistConditionalLoadsStores(
     BranchInst *BI,
     SmallVectorImpl<Instruction *> &SpeculatedConditionalLoadsStores,
-    std::optional<bool> Invert) {
+    std::optional<bool> Invert, Instruction *Sel) {
   auto &Context = BI->getParent()->getContext();
   auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
   auto *Cond = BI->getOperand(0);
   // Construct the condition if needed.
   BasicBlock *BB = BI->getParent();
-  IRBuilder<> Builder(
-      Invert.has_value() ? SpeculatedConditionalLoadsStores.back() : BI);
   Value *Mask = nullptr;
   Value *MaskFalse = nullptr;
   Value *MaskTrue = nullptr;
   if (Invert.has_value()) {
+    IRBuilder<> Builder(Sel ? Sel : SpeculatedConditionalLoadsStores.back());
     Mask = Builder.CreateBitCast(
         *Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
         VCondTy);
   } else {
+    IRBuilder<> Builder(BI);
     MaskFalse = Builder.CreateBitCast(
         Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
     MaskTrue = Builder.CreateBitCast(Cond, VCondTy);
@@ -1723,13 +1725,20 @@ static void hoistConditionalLoadsStores(
       PHINode *PN = nullptr;
       Value *PassThru = nullptr;
       if (Invert.has_value())
-        for (User *U : I->users())
+        for (User *U : I->users()) {
           if ((PN = dyn_cast<PHINode>(U))) {
             PassThru = Builder.CreateBitCast(
                 PeekThroughBitcasts(PN->getIncomingValueForBlock(BB)),
                 FixedVectorType::get(Ty, 1));
-            break;
+          } else if (auto *Ins = cast<Instruction>(U);
+                     Sel && Ins->getParent() == BB) {
+            // This happens when store or/and a speculative instruction between
+            // load and store were hoisted to the BB. Make sure the masked load
+            // inserted before its use.
+            // We assume there's one of such use.
+            Builder.SetInsertPoint(Ins);
           }
+        }
       MaskedLoadStore = Builder.CreateMaskedLoad(
           FixedVectorType::get(Ty, 1), Op0, LI->getAlign(), Mask, PassThru);
       Value *NewLoadStore = Builder.CreateBitCast(MaskedLoadStore, Ty);
@@ -1770,10 +1779,10 @@ static bool isSafeCheapLoadStore(const Instruction *I,
   // Not handle volatile or atomic.
   bool IsStore = false;
   if (auto *L = dyn_cast<LoadInst>(I)) {
-    if (!L->isSimple())
+    if (!L->isSimple() || !HoistLoadsWithCondFaulting)
       return false;
   } else if (auto *S = dyn_cast<StoreInst>(I)) {
-    if (!S->isSimple())
+    if (!S->isSimple() || !HoistStoresWithCondFaulting)
       return false;
     IsStore = true;
   } else
@@ -3214,8 +3223,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
 
   unsigned SpeculatedInstructions = 0;
-  bool HoistLoadsStores = HoistLoadsStoresWithCondFaulting &&
-                          Options.HoistLoadsStoresWithCondFaulting;
+  bool HoistLoadsStores = Options.HoistLoadsStoresWithCondFaulting;
   SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
   Value *SpeculatedStoreValue = nullptr;
   StoreInst *SpeculatedStore = nullptr;
@@ -3310,6 +3318,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   // If we get here, we can hoist the instruction and if-convert.
   LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
+  Instruction *Sel = nullptr;
   // Insert a select of the value of the speculated store.
   if (SpeculatedStoreValue) {
     IRBuilder<NoFolder> Builder(BI);
@@ -3320,6 +3329,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
       std::swap(TrueV, FalseV);
     Value *S = Builder.CreateSelect(
         BrCond, TrueV, FalseV, "spec.store.select", BI);
+    Sel = cast<Instruction>(S);
     SpeculatedStore->setOperand(0, S);
     SpeculatedStore->applyMergedLocation(BI->getDebugLoc(),
                                          SpeculatedStore->getDebugLoc());
@@ -3392,7 +3402,8 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
              std::prev(ThenBB->end()));
 
   if (!SpeculatedConditionalLoadsStores.empty())
-    hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores, Invert);
+    hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores, Invert,
+                                Sel);
 
   // Insert selects and rewrite the PHI operands.
   IRBuilder<NoFolder> Builder(BI);
@@ -8020,8 +8031,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
           hoistCommonCodeFromSuccessors(BI, !Options.HoistCommonInsts))
         return requestResimplify();
 
-      if (BI && HoistLoadsStoresWithCondFaulting &&
-          Options.HoistLoadsStoresWithCondFaulting &&
+      if (BI && Options.HoistLoadsStoresWithCondFaulting &&
           isProfitableToSpeculate(BI, std::nullopt, TTI)) {
         SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
         auto CanSpeculateConditionalLoadsStores = [&]() {
@@ -8044,7 +8054,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
 
         if (CanSpeculateConditionalLoadsStores()) {
           hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores,
-                                      std::nullopt);
+                                      std::nullopt, nullptr);
           return requestResimplify();
         }
       }

diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
index 5c9058b482320..100806612dffc 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -1,24 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefixes=CHECK,LOADSTORE
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -simplifycfg-hoist-loads-with-cond-faulting=false -S | FileCheck %s --check-prefixes=CHECK,NONE,STOREONLY
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -simplifycfg-hoist-stores-with-cond-faulting=false -S | FileCheck %s --check-prefixes=CHECK,NONE,LOADONLY
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -simplifycfg-hoist-stores-with-cond-faulting=false -simplifycfg-hoist-loads-with-cond-faulting=false -S | FileCheck %s --check-prefixes=CHECK,NONE,NONEONLY
 
 ;; Basic case: check masked.load/store is generated for i16/i32/i64.
 define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) {
-; CHECK-LABEL: @basic(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 2, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr [[Q]], i32 8, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    ret void
+; LOADSTORE-LABEL: @basic(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; LOADSTORE-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; LOADSTORE-NEXT:    [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; LOADSTORE-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; LOADSTORE-NEXT:    [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 2, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr [[Q]], i32 8, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    ret void
+;
+; NONE-LABEL: @basic(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_END:%.*]]
+; NONE:       if.true:
+; NONE-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; NONE-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; NONE-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8
+; NONE-NEXT:    store i16 [[TMP0]], ptr [[B]], align 2
+; NONE-NEXT:    store i32 [[TMP1]], ptr [[P]], align 4
+; NONE-NEXT:    store i64 [[TMP2]], ptr [[Q]], align 8
+; NONE-NEXT:    br label [[IF_END]]
+; NONE:       if.end:
+; NONE-NEXT:    ret void
 ;
 entry:
   br i1 %cond, label %if.true, label %if.false
@@ -41,16 +58,27 @@ if.end:
 
 ;; Successor 1 branches to successor 0.
 define void @succ1to0(ptr %p, ptr %q, i32 %a) {
-; CHECK-LABEL: @succ1to0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
-; CHECK-NEXT:    ret void
+; LOADSTORE-LABEL: @succ1to0(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; LOADSTORE-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
+; LOADSTORE-NEXT:    ret void
+;
+; NONE-LABEL: @succ1to0(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; NONE-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NONE:       if.end:
+; NONE-NEXT:    ret void
+; NONE:       if.then:
+; NONE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; NONE-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; NONE-NEXT:    br label [[IF_END]]
 ;
 entry:
   %tobool = icmp ne i32 %a, 0
@@ -67,14 +95,45 @@ if.then:
 
 ;; Successor 1 branches to successor 0 and there is a phi node.
 define i32 @succ1to0_phi(ptr %p)  {
-; CHECK-LABEL: @succ1to0_phi(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
-; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LOADSTORE-LABEL: @succ1to0_phi(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; LOADSTORE-NEXT:    ret i32 [[TMP3]]
+;
+; STOREONLY-LABEL: @succ1to0_phi(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; STOREONLY-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; STOREONLY:       if.false:
+; STOREONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
+; STOREONLY-NEXT:    br label [[IF_TRUE]]
+; STOREONLY:       if.true:
+; STOREONLY-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; STOREONLY-NEXT:    ret i32 [[RES]]
+;
+; LOADONLY-LABEL: @succ1to0_phi(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; LOADONLY-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; LOADONLY-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADONLY-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
+; LOADONLY-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; LOADONLY-NEXT:    ret i32 [[TMP3]]
+;
+; NONEONLY-LABEL: @succ1to0_phi(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; NONEONLY-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONEONLY:       if.false:
+; NONEONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
+; NONEONLY-NEXT:    br label [[IF_TRUE]]
+; NONEONLY:       if.true:
+; NONEONLY-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; NONEONLY-NEXT:    ret i32 [[RES]]
 ;
 entry:
   %cond = icmp eq ptr %p, null
@@ -91,16 +150,28 @@ if.true:
 
 ;; Successor 0 branches to successor 1.
 define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
-; CHECK-LABEL: @succ0to1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    ret void
+; LOADSTORE-LABEL: @succ0to1(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; LOADSTORE-NEXT:    ret void
+;
+; NONE-LABEL: @succ0to1(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; NONE-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONE:       if.false:
+; NONE-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; NONE-NEXT:    ret void
+; NONE:       if.true:
+; NONE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; NONE-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; NONE-NEXT:    br label [[IF_FALSE]]
 ;
 entry:
   %cond = icmp eq i32 %a, 0
@@ -121,16 +192,29 @@ if.end:
 
 ;; Load after store can be hoisted.
 define i64 @load_after_store(i32 %a, ptr %b, ptr %p) {
-; CHECK-LABEL: @load_after_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i64 [[ZEXT]], i64 0
-; CHECK-NEXT:    ret i64 [[SPEC_SELECT]]
+; LOADSTORE-LABEL: @load_after_store(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; LOADSTORE-NEXT:    [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
+; LOADSTORE-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i64 [[ZEXT]], i64 0
+; LOADSTORE-NEXT:    ret i64 [[SPEC_SELECT]]
+;
+; NONE-LABEL: @load_after_store(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; NONE-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[COMMON_RET:%.*]]
+; NONE:       common.ret:
+; NONE-NEXT:    [[COMMON_RET_OP:%.*]] = phi i64 [ [[ZEXT:%.*]], [[IF_TRUE]] ], [ 0, [[ENTRY:%.*]] ]
+; NONE-NEXT:    ret i64 [[COMMON_RET_OP]]
+; NONE:       if.true:
+; NONE-NEXT:    store i32 1, ptr [[B:%.*]], align 4
+; NONE-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; NONE-NEXT:    [[ZEXT]] = zext i16 [[TMP0]] to i64
+; NONE-NEXT:    br label [[COMMON_RET]]
 ;
 entry:
   %cond = icmp eq i32 %a, 0
@@ -148,15 +232,49 @@ if.end:
 
 ;; Speculatable memory read doesn't prevent the hoist.
 define void @load_skip_speculatable_memory_read(i32 %a, ptr %p, ptr %q) {
-; CHECK-LABEL: @load_skip_speculatable_memory_read(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[READ]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    ret void
+; LOADSTORE-LABEL: @load_skip_speculatable_memory_read(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; LOADSTORE-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = bitcast i32 [[READ]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; LOADSTORE-NEXT:    ret void
+;
+; STOREONLY-LABEL: @load_skip_speculatable_memory_read(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; STOREONLY-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
+; STOREONLY-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; STOREONLY-NEXT:    [[TMP1:%.*]] = bitcast i32 [[READ]] to <1 x i32>
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; STOREONLY-NEXT:    ret void
+;
+; LOADONLY-LABEL: @load_skip_speculatable_memory_read(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; LOADONLY-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; LOADONLY:       if.false:
+; LOADONLY-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; LOADONLY-NEXT:    ret void
+; LOADONLY:       if.true:
+; LOADONLY-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
+; LOADONLY-NEXT:    store i32 [[READ]], ptr [[P:%.*]], align 4
+; LOADONLY-NEXT:    br label [[IF_FALSE]]
+;
+; NONEONLY-LABEL: @load_skip_speculatable_memory_read(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; NONEONLY-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONEONLY:       if.false:
+; NONEONLY-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; NONEONLY-NEXT:    ret void
+; NONEONLY:       if.true:
+; NONEONLY-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
+; NONEONLY-NEXT:    store i32 [[READ]], ptr [[P:%.*]], align 4
+; NONEONLY-NEXT:    br label [[IF_FALSE]]
 ;
 entry:
   %cond = icmp eq i32 %a, 0
@@ -177,15 +295,49 @@ if.end:
 
 ;; Source of the load can be a GEP.
 define i32 @load_from_gep(ptr %p)  {
-; CHECK-LABEL: @load_from_gep(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[ARRAYIDX]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LOADSTORE-LABEL: @load_from_gep(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; LOADSTORE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[ARRAYIDX]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; LOADSTORE-NEXT:    ret i32 [[TMP3]]
+;
+; STOREONLY-LABEL: @load_from_gep(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; STOREONLY-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; STOREONLY:       if.false:
+; STOREONLY-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; STOREONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; STOREONLY-NEXT:    br label [[IF_TRUE]]
+; STOREONLY:       if.true:
+; STOREONLY-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; STOREONLY-NEXT:    ret i32 [[RES]]
+;
+; LOADONLY-LABEL: @load_from_gep(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; LOADONLY-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; LOADONLY-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; LOADONLY-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADONLY-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[ARRAYIDX]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
+; LOADONLY-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; LOADONLY-NEXT:    ret i32 [[TMP3]]
+;
+; NONEONLY-LABEL: @load_from_gep(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; NONEONLY-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONEONLY:       if.false:
+; NONEONLY-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; NONEONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NONEONLY-NEXT:    br label [[IF_TRUE]]
+; NONEONLY:       if.true:
+; NONEONLY-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; NONEONLY-NEXT:    ret i32 [[RES]]
 ;
 entry:
   %cond = icmp eq ptr %p, null
@@ -203,18 +355,30 @@ if.true:
 
 ;; Metadata range/annotation are kept.
 define void @nondebug_metadata(i1 %cond, ptr %p, ptr %q) {
-; CHECK-LABEL: @nondebug_metadata(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call range(i16 0, 10) <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !annotation [[META5:![0-9]+]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <1 x i16>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP5]], ptr [[Q]], i32 4, <1 x i1> [[TMP0]]), !annotation [[META5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[P]], i32 2, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    ret void
+; LOADSTORE-LABEL: @nondebug_metadata(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = call range(i16 0, 10) <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !annotation [[META5:![0-9]+]]
+; LOADSTORE-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; LOADSTORE-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <1 x i16>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP5]], ptr [[Q]], i32 4, <1 x i1> [[TMP0]]), !annotation [[META5]]
+; LOADSTORE-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[P]], i32 2, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    ret void
+;
+; NONE-LABEL: @nondebug_metadata(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONE:       if.false:
+; NONE-NEXT:    ret void
+; NONE:       if.true:
+; NONE-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2, !range [[RNG5:![0-9]+]]
+; NONE-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Q:%.*]], align 4, !annotation [[META6:![0-9]+]]
+; NONE-NEXT:    store i16 [[TMP0]], ptr [[Q]], align 4, !annotation [[META6]]
+; NONE-NEXT:    store i32 [[TMP1]], ptr [[P]], align 2
+; NONE-NEXT:    br label [[IF_FALSE]]
 ;
 entry:
   br i1 %cond, label %if.true, label %if.false
@@ -231,12 +395,41 @@ if.true:
 }
 
 define i16 @debug_metadata_diassign(i1 %cond, i16 %a, ptr %p) {
-; CHECK-LABEL: @debug_metadata_diassign(
-; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> splat (i16 7), ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i16 3, i16 2
-; CHECK-NEXT:    ret i16 [[SPEC_SELECT]]
+; LOADSTORE-LABEL: @debug_metadata_diassign(
+; LOADSTORE-NEXT:  bb0:
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> splat (i16 7), ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i16 3, i16 2
+; LOADSTORE-NEXT:    ret i16 [[SPEC_SELECT]]
+;
+; STOREONLY-LABEL: @debug_metadata_diassign(
+; STOREONLY-NEXT:  bb0:
+; STOREONLY-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> splat (i16 7), ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i16 3, i16 2
+; STOREONLY-NEXT:    ret i16 [[SPEC_SELECT]]
+;
+; LOADONLY-LABEL: @debug_metadata_diassign(
+; LOADONLY-NEXT:  bb0:
+; LOADONLY-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; LOADONLY:       if.true:
+; LOADONLY-NEXT:    store i16 7, ptr [[P:%.*]], align 4, !DIAssignID [[DIASSIGNID7:![0-9]+]]
+; LOADONLY-NEXT:    br label [[IF_FALSE]]
+; LOADONLY:       if.false:
+; LOADONLY-NEXT:    [[RET:%.*]] = phi i16 [ 2, [[BB0:%.*]] ], [ 3, [[IF_TRUE]] ]
+; LOADONLY-NEXT:      #dbg_assign(i16 [[RET]], [[META8:![0-9]+]], !DIExpression(), [[DIASSIGNID7]], ptr [[P]], !DIExpression(), [[META11:![0-9]+]])
+; LOADONLY-NEXT:    ret i16 [[RET]]
+;
+; NONEONLY-LABEL: @debug_metadata_diassign(
+; NONEONLY-NEXT:  bb0:
+; NONEONLY-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONEONLY:       if.true:
+; NONEONLY-NEXT:    store i16 7, ptr [[P:%.*]], align 4, !DIAssignID [[DIASSIGNID7:![0-9]+]]
+; NONEONLY-NEXT:    br label [[IF_FALSE]]
+; NONEONLY:       if.false:
+; NONEONLY-NEXT:    [[RET:%.*]] = phi i16 [ 2, [[BB0:%.*]] ], [ 3, [[IF_TRUE]] ]
+; NONEONLY-NEXT:      #dbg_assign(i16 [[RET]], [[META8:![0-9]+]], !DIExpression(), [[DIASSIGNID7]], ptr [[P]], !DIExpression(), [[META11:![0-9]+]])
+; NONEONLY-NEXT:    ret i16 [[RET]]
 ;
 bb0:
   br i1 %cond, label %if.true, label %if.false
@@ -253,14 +446,45 @@ if.false:
 
 ;; Not crash when working with opt controlled by simplifycfg-hoist-cond-stores.
 define i32 @hoist_cond_stores(i1 %cond, ptr %p) {
-; CHECK-LABEL: @hoist_cond_stores(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i1 false, ptr [[P:%.*]], align 2
-; CHECK-NEXT:    [[SPEC_STORE_SELECT:%.*]] = select i1 [[COND:%.*]], i1 false, i1 false
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[P]], i32 8, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    store i1 [[SPEC_STORE_SELECT]], ptr [[P]], align 2
-; CHECK-NEXT:    ret i32 0
+; LOADSTORE-LABEL: @hoist_cond_stores(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    store i1 false, ptr [[P:%.*]], align 2
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; LOADSTORE-NEXT:    [[SPEC_STORE_SELECT:%.*]] = select i1 [[COND]], i1 false, i1 false
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[P]], i32 8, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    store i1 [[SPEC_STORE_SELECT]], ptr [[P]], align 2
+; LOADSTORE-NEXT:    ret i32 0
+;
+; STOREONLY-LABEL: @hoist_cond_stores(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    store i1 false, ptr [[P:%.*]], align 2
+; STOREONLY-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; STOREONLY-NEXT:    [[SPEC_STORE_SELECT:%.*]] = select i1 [[COND]], i1 false, i1 false
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[P]], i32 8, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    store i1 [[SPEC_STORE_SELECT]], ptr [[P]], align 2
+; STOREONLY-NEXT:    ret i32 0
+;
+; LOADONLY-LABEL: @hoist_cond_stores(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    store i1 false, ptr [[P:%.*]], align 2
+; LOADONLY-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; LOADONLY:       if.true:
+; LOADONLY-NEXT:    store i32 0, ptr [[P]], align 8
+; LOADONLY-NEXT:    store i1 false, ptr [[P]], align 2
+; LOADONLY-NEXT:    br label [[IF_FALSE]]
+; LOADONLY:       if.false:
+; LOADONLY-NEXT:    ret i32 0
+;
+; NONEONLY-LABEL: @hoist_cond_stores(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    store i1 false, ptr [[P:%.*]], align 2
+; NONEONLY-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONEONLY:       if.true:
+; NONEONLY-NEXT:    store i32 0, ptr [[P]], align 8
+; NONEONLY-NEXT:    store i1 false, ptr [[P]], align 2
+; NONEONLY-NEXT:    br label [[IF_FALSE]]
+; NONEONLY:       if.false:
+; NONEONLY-NEXT:    ret i32 0
 ;
 entry:
   store i1 false, ptr %p, align 2
@@ -277,18 +501,33 @@ if.false:                                    ; preds = %if.true, %entry
 
 ;; Both of successor 0 and successor 1 have a single predecessor.
 define i32 @single_predecessor(ptr %p, ptr %q, i32 %a) {
-; CHECK-LABEL: @single_predecessor(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2, i32 3
-; CHECK-NEXT:    ret i32 [[DOT]]
+; LOADSTORE-LABEL: @single_predecessor(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]])
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; LOADSTORE-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
+; LOADSTORE-NEXT:    [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2, i32 3
+; LOADSTORE-NEXT:    ret i32 [[DOT]]
+;
+; NONE-LABEL: @single_predecessor(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; NONE-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NONE:       common.ret:
+; NONE-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 2, [[IF_END]] ], [ 3, [[IF_THEN]] ]
+; NONE-NEXT:    ret i32 [[COMMON_RET_OP]]
+; NONE:       if.end:
+; NONE-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; NONE-NEXT:    br label [[COMMON_RET:%.*]]
+; NONE:       if.then:
+; NONE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
+; NONE-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; NONE-NEXT:    br label [[COMMON_RET]]
 ;
 entry:
   %tobool = icmp ne i32 %a, 0
@@ -306,16 +545,55 @@ if.then:
 
 ;; Hoist 6 stores.
 define void @threshold_6(i1 %cond, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6) {
-; CHECK-LABEL: @threshold_6(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[P1:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 2), ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 3), ptr [[P3:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 4), ptr [[P4:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 5), ptr [[P5:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 6), ptr [[P6:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    ret void
+; LOADSTORE-LABEL: @threshold_6(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[P1:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 2), ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 3), ptr [[P3:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 4), ptr [[P4:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 5), ptr [[P5:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 6), ptr [[P6:%.*]], i32 4, <1 x i1> [[TMP0]])
+; LOADSTORE-NEXT:    ret void
+;
+; STOREONLY-LABEL: @threshold_6(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[P1:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 2), ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 3), ptr [[P3:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 4), ptr [[P4:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 5), ptr [[P5:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 6), ptr [[P6:%.*]], i32 4, <1 x i1> [[TMP0]])
+; STOREONLY-NEXT:    ret void
+;
+; LOADONLY-LABEL: @threshold_6(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; LOADONLY:       if.true:
+; LOADONLY-NEXT:    store i32 1, ptr [[P1:%.*]], align 4
+; LOADONLY-NEXT:    store i32 2, ptr [[P2:%.*]], align 4
+; LOADONLY-NEXT:    store i32 3, ptr [[P3:%.*]], align 4
+; LOADONLY-NEXT:    store i32 4, ptr [[P4:%.*]], align 4
+; LOADONLY-NEXT:    store i32 5, ptr [[P5:%.*]], align 4
+; LOADONLY-NEXT:    store i32 6, ptr [[P6:%.*]], align 4
+; LOADONLY-NEXT:    br label [[IF_FALSE]]
+; LOADONLY:       if.false:
+; LOADONLY-NEXT:    ret void
+;
+; NONEONLY-LABEL: @threshold_6(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONEONLY:       if.true:
+; NONEONLY-NEXT:    store i32 1, ptr [[P1:%.*]], align 4
+; NONEONLY-NEXT:    store i32 2, ptr [[P2:%.*]], align 4
+; NONEONLY-NEXT:    store i32 3, ptr [[P3:%.*]], align 4
+; NONEONLY-NEXT:    store i32 4, ptr [[P4:%.*]], align 4
+; NONEONLY-NEXT:    store i32 5, ptr [[P5:%.*]], align 4
+; NONEONLY-NEXT:    store i32 6, ptr [[P6:%.*]], align 4
+; NONEONLY-NEXT:    br label [[IF_FALSE]]
+; NONEONLY:       if.false:
+; NONEONLY-NEXT:    ret void
 ;
 entry:
   br i1 %cond, label %if.true, label %if.false
@@ -578,16 +856,49 @@ if.true:
 
 ;; Not hoist if the branch is predictable and the `then` BB is not likely to execute.
 define void @not_likely_to_execute(ptr %p, ptr %q, i32 %a) {
-; CHECK-LABEL: @not_likely_to_execute(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF6:![0-9]+]]
-; CHECK:       if.end:
-; CHECK-NEXT:    ret void
-; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT:    br label [[IF_END]]
+; LOADSTORE-LABEL: @not_likely_to_execute(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; LOADSTORE-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF6:![0-9]+]]
+; LOADSTORE:       if.end:
+; LOADSTORE-NEXT:    ret void
+; LOADSTORE:       if.then:
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; LOADSTORE-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; LOADSTORE-NEXT:    br label [[IF_END]]
+;
+; STOREONLY-LABEL: @not_likely_to_execute(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; STOREONLY-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF7:![0-9]+]]
+; STOREONLY:       if.end:
+; STOREONLY-NEXT:    ret void
+; STOREONLY:       if.then:
+; STOREONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; STOREONLY-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; STOREONLY-NEXT:    br label [[IF_END]]
+;
+; LOADONLY-LABEL: @not_likely_to_execute(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; LOADONLY-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF12:![0-9]+]]
+; LOADONLY:       if.end:
+; LOADONLY-NEXT:    ret void
+; LOADONLY:       if.then:
+; LOADONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; LOADONLY-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; LOADONLY-NEXT:    br label [[IF_END]]
+;
+; NONEONLY-LABEL: @not_likely_to_execute(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; NONEONLY-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF12:![0-9]+]]
+; NONEONLY:       if.end:
+; NONEONLY-NEXT:    ret void
+; NONEONLY:       if.then:
+; NONEONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; NONEONLY-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; NONEONLY-NEXT:    br label [[IF_END]]
 ;
 entry:
   %tobool = icmp ne i32 %a, 0
@@ -671,18 +982,57 @@ if.false:
 }
 
 define i32 @succ_phi_has_3input(i1 %cond1, ptr %p, i1 %cond2) {
-; CHECK-LABEL: @succ_phi_has_3input(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]]
-; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND2:%.*]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[P:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> zeroinitializer)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    br label [[BB3]]
-; CHECK:       bb3:
-; CHECK-NEXT:    [[Y:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB1]] ]
-; CHECK-NEXT:    store i64 [[Y]], ptr [[P]], align 8
-; CHECK-NEXT:    ret i32 0
+; LOADSTORE-LABEL: @succ_phi_has_3input(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    br i1 [[COND1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]]
+; LOADSTORE:       bb1:
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND2:%.*]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[P:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> zeroinitializer)
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; LOADSTORE-NEXT:    br label [[BB3]]
+; LOADSTORE:       bb3:
+; LOADSTORE-NEXT:    [[Y:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB1]] ]
+; LOADSTORE-NEXT:    store i64 [[Y]], ptr [[P]], align 8
+; LOADSTORE-NEXT:    ret i32 0
+;
+; STOREONLY-LABEL: @succ_phi_has_3input(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    [[COND2_NOT:%.*]] = xor i1 [[COND2:%.*]], true
+; STOREONLY-NEXT:    [[BRMERGE:%.*]] = select i1 [[COND1:%.*]], i1 true, i1 [[COND2_NOT]]
+; STOREONLY-NEXT:    br i1 [[BRMERGE]], label [[BB3:%.*]], label [[BB2:%.*]]
+; STOREONLY:       bb2:
+; STOREONLY-NEXT:    [[X:%.*]] = load i64, ptr [[P:%.*]], align 8
+; STOREONLY-NEXT:    br label [[BB3]]
+; STOREONLY:       bb3:
+; STOREONLY-NEXT:    [[Y:%.*]] = phi i64 [ [[X]], [[BB2]] ], [ 0, [[ENTRY:%.*]] ]
+; STOREONLY-NEXT:    store i64 [[Y]], ptr [[P]], align 8
+; STOREONLY-NEXT:    ret i32 0
+;
+; LOADONLY-LABEL: @succ_phi_has_3input(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    br i1 [[COND1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]]
+; LOADONLY:       bb1:
+; LOADONLY-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND2:%.*]] to <1 x i1>
+; LOADONLY-NEXT:    [[TMP1:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[P:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> zeroinitializer)
+; LOADONLY-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; LOADONLY-NEXT:    br label [[BB3]]
+; LOADONLY:       bb3:
+; LOADONLY-NEXT:    [[Y:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB1]] ]
+; LOADONLY-NEXT:    store i64 [[Y]], ptr [[P]], align 8
+; LOADONLY-NEXT:    ret i32 0
+;
+; NONEONLY-LABEL: @succ_phi_has_3input(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    [[COND2_NOT:%.*]] = xor i1 [[COND2:%.*]], true
+; NONEONLY-NEXT:    [[BRMERGE:%.*]] = select i1 [[COND1:%.*]], i1 true, i1 [[COND2_NOT]]
+; NONEONLY-NEXT:    br i1 [[BRMERGE]], label [[BB3:%.*]], label [[BB2:%.*]]
+; NONEONLY:       bb2:
+; NONEONLY-NEXT:    [[X:%.*]] = load i64, ptr [[P:%.*]], align 8
+; NONEONLY-NEXT:    br label [[BB3]]
+; NONEONLY:       bb3:
+; NONEONLY-NEXT:    [[Y:%.*]] = phi i64 [ [[X]], [[BB2]] ], [ 0, [[ENTRY:%.*]] ]
+; NONEONLY-NEXT:    store i64 [[Y]], ptr [[P]], align 8
+; NONEONLY-NEXT:    ret i32 0
 ;
 entry:
   br i1 %cond1, label %bb3, label %bb1
@@ -701,16 +1051,28 @@ bb3:                                                ; preds = %bb2, %bb1, %entry
 }
 
 define i32 @succ1to0_phi2(ptr %p, ptr %p2) {
-; CHECK-LABEL: @succ1to0_phi2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
-; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP1]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; LOADSTORE-LABEL: @succ1to0_phi2(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; LOADSTORE-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP1]])
+; LOADSTORE-NEXT:    ret i32 [[TMP3]]
+;
+; NONE-LABEL: @succ1to0_phi2(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; NONE-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONE:       if.false:
+; NONE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
+; NONE-NEXT:    store i32 [[TMP0]], ptr [[P2:%.*]], align 4
+; NONE-NEXT:    br label [[IF_TRUE]]
+; NONE:       if.true:
+; NONE-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; NONE-NEXT:    ret i32 [[RES]]
 ;
 entry:
   %cond = icmp eq ptr %p, null
@@ -727,19 +1089,33 @@ if.true:
 }
 
 define i32 @succ1to0_phi3(ptr %p, ptr %p2, i32 %x) {
-; CHECK-LABEL: @succ1to0_phi3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
-; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[X:%.*]] to <1 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP1]])
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP4]]
-; CHECK-NEXT:    [[RES:%.*]] = add i32 [[SPEC_SELECT]], [[TMP4]]
-; CHECK-NEXT:    ret i32 [[RES]]
+; LOADSTORE-LABEL: @succ1to0_phi3(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; LOADSTORE-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; LOADSTORE-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast i32 [[X:%.*]] to <1 x i32>
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer)
+; LOADSTORE-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; LOADSTORE-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP1]])
+; LOADSTORE-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 [[X]], i32 [[TMP4]]
+; LOADSTORE-NEXT:    [[RES:%.*]] = add i32 [[TMP4]], [[SPEC_SELECT]]
+; LOADSTORE-NEXT:    ret i32 [[RES]]
+;
+; NONE-LABEL: @succ1to0_phi3(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; NONE-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; NONE:       if.false:
+; NONE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
+; NONE-NEXT:    store i32 [[TMP0]], ptr [[P2:%.*]], align 4
+; NONE-NEXT:    br label [[IF_TRUE]]
+; NONE:       if.true:
+; NONE-NEXT:    [[RES0:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; NONE-NEXT:    [[RES1:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ [[X:%.*]], [[ENTRY]] ]
+; NONE-NEXT:    [[RES:%.*]] = add i32 [[RES0]], [[RES1]]
+; NONE-NEXT:    ret i32 [[RES]]
 ;
 entry:
   %cond = icmp eq ptr %p, null
@@ -795,6 +1171,63 @@ return:                                           ; preds = %sw.bb, %entry.if
   ret i32 %ret
 }
 
+;; Check cond-faulting-load can work with hoisted store when no cond-faulting-store.
+define void @hoist_store_without_cstore(ptr %0, ptr %1, i1 %cmp) {
+; LOADSTORE-LABEL: @hoist_store_without_cstore(
+; LOADSTORE-NEXT:  entry:
+; LOADSTORE-NEXT:    store i32 0, ptr [[TMP1:%.*]], align 8
+; LOADSTORE-NEXT:    [[TMP2:%.*]] = bitcast i1 [[CMP:%.*]] to <1 x i1>
+; LOADSTORE-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP0:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; LOADSTORE-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; LOADSTORE-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; LOADSTORE-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[TMP1]], i32 8, <1 x i1> [[TMP2]])
+; LOADSTORE-NEXT:    ret void
+;
+; STOREONLY-LABEL: @hoist_store_without_cstore(
+; STOREONLY-NEXT:  entry:
+; STOREONLY-NEXT:    store i32 0, ptr [[TMP1:%.*]], align 8
+; STOREONLY-NEXT:    br i1 [[CMP:%.*]], label [[IF_THEN1:%.*]], label [[IF_END:%.*]]
+; STOREONLY:       if.then1:
+; STOREONLY-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP0:%.*]], align 4
+; STOREONLY-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 8
+; STOREONLY-NEXT:    br label [[IF_END]]
+; STOREONLY:       if.end:
+; STOREONLY-NEXT:    ret void
+;
+; LOADONLY-LABEL: @hoist_store_without_cstore(
+; LOADONLY-NEXT:  entry:
+; LOADONLY-NEXT:    store i32 0, ptr [[TMP1:%.*]], align 8
+; LOADONLY-NEXT:    [[TMP2:%.*]] = bitcast i1 [[CMP:%.*]] to <1 x i1>
+; LOADONLY-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP0:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; LOADONLY-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; LOADONLY-NEXT:    [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP4]], i32 0
+; LOADONLY-NEXT:    store i32 [[SPEC_STORE_SELECT]], ptr [[TMP1]], align 8
+; LOADONLY-NEXT:    ret void
+;
+; NONEONLY-LABEL: @hoist_store_without_cstore(
+; NONEONLY-NEXT:  entry:
+; NONEONLY-NEXT:    store i32 0, ptr [[TMP1:%.*]], align 8
+; NONEONLY-NEXT:    br i1 [[CMP:%.*]], label [[IF_THEN1:%.*]], label [[IF_END:%.*]]
+; NONEONLY:       if.then1:
+; NONEONLY-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP0:%.*]], align 4
+; NONEONLY-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 8
+; NONEONLY-NEXT:    br label [[IF_END]]
+; NONEONLY:       if.end:
+; NONEONLY-NEXT:    ret void
+;
+entry:
+  store i32 0, ptr %1, align 8
+  br i1 %cmp, label %if.then1, label %if.end
+
+if.then1:                                         ; preds = %entry
+  %2 = load i32, ptr %0, align 4
+  store i32 %2, ptr %1, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then1, %entry
+  ret void
+}
+
 declare i32 @read_memory_only() readonly nounwind willreturn speculatable
 
 !llvm.dbg.cu = !{!0}


        


More information about the llvm-commits mailing list