[llvm] [Sink] Allow sinking of loads to distant blocks (PR #135986)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 16 09:49:22 PDT 2025
https://github.com/LU-JOHN created https://github.com/llvm/llvm-project/pull/135986
Allow more loads to be sunk by ensuring that no conflicting stores are on paths to the target block.
From 36eeb25bcd0d8a4b85489413bc1b1757d89e988d Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 16 Apr 2025 06:39:59 -0500
Subject: [PATCH 1/4] Extend Sink pass to allow loads to be sunk to
non-immediate successor blocks
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Transforms/Scalar/Sink.cpp | 76 +++++++++----
llvm/test/Transforms/Sink/loadsink.ll | 152 ++++++++++++++++++++++++++
2 files changed, 204 insertions(+), 24 deletions(-)
create mode 100644 llvm/test/Transforms/Sink/loadsink.ll
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 1a48a59c4189e..57ce0c8990f4a 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -27,43 +27,71 @@ using namespace llvm;
STATISTIC(NumSunk, "Number of instructions sunk");
STATISTIC(NumSinkIter, "Number of sinking iterations");
-static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &Stores) {
-
- if (Inst->mayWriteToMemory()) {
- Stores.insert(Inst);
- return false;
- }
-
+static bool hasStoreConflict(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
MemoryLocation Loc = MemoryLocation::get(L);
for (Instruction *S : Stores)
if (isModSet(AA.getModRefInfo(S, Loc)))
- return false;
+ return true;
+ } else if (auto *Call = dyn_cast<CallBase>(Inst)) {
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Call)))
+ return true;
}
+ return false;
+}
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() ||
Inst->mayThrow() || !Inst->willReturn())
return false;
-
- if (auto *Call = dyn_cast<CallBase>(Inst)) {
- // Convergent operations cannot be made control-dependent on additional
- // values.
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
+ if (auto *Call = dyn_cast<CallBase>(Inst))
if (Call->isConvergent())
return false;
+ if (hasStoreConflict(Inst, AA, Stores))
+ return false;
+ return true;
+}
- for (Instruction *S : Stores)
- if (isModSet(AA.getModRefInfo(S, Call)))
- return false;
- }
+typedef SmallPtrSet<BasicBlock *, 8> BlocksSet;
+static void findStores(SmallPtrSetImpl<Instruction *> &Stores,
+ BasicBlock *LoadBB, BasicBlock *BB,
+ BlocksSet &VisitedBlocksSet) {
+ if (BB == LoadBB || VisitedBlocksSet.contains(BB))
+ return;
+ VisitedBlocksSet.insert(BB);
+
+ for (Instruction &Inst : *BB)
+ if (Inst.mayWriteToMemory())
+ Stores.insert(&Inst);
+ for (BasicBlock *Pred : predecessors(BB))
+ findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+}
- return true;
+static bool hasConflictingStoreBeforeSuccToSinkTo(AliasAnalysis &AA,
+ Instruction *ReadMemInst,
+ BasicBlock *SuccToSinkTo) {
+ BlocksSet VisitedBlocksSet;
+ SmallPtrSet<Instruction *, 8> Stores;
+ BasicBlock *LoadBB = ReadMemInst->getParent();
+ for (BasicBlock *Pred : predecessors(SuccToSinkTo))
+ findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+ return hasStoreConflict(ReadMemInst, AA, Stores);
}
/// IsAcceptableTarget - Return true if it is possible to sink the instruction
/// in the specified basic block.
-static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
- DominatorTree &DT, LoopInfo &LI) {
+static bool IsAcceptableTarget(AliasAnalysis &AA, Instruction *Inst,
+ BasicBlock *SuccToSinkTo, DominatorTree &DT,
+ LoopInfo &LI) {
assert(Inst && "Instruction to be sunk is null");
assert(SuccToSinkTo && "Candidate sink target is null");
@@ -76,10 +104,10 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
// just punt.
// FIXME: Split critical edges if not backedges.
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
- // We cannot sink a load across a critical edge - there may be stores in
- // other code paths.
+ // Ensure that there is no conflicting store on any path to SuccToSinkTo.
if (Inst->mayReadFromMemory() &&
- !Inst->hasMetadata(LLVMContext::MD_invariant_load))
+ !Inst->hasMetadata(LLVMContext::MD_invariant_load) &&
+ hasConflictingStoreBeforeSuccToSinkTo(AA, Inst, SuccToSinkTo))
return false;
// We don't want to sink across a critical edge if we don't dominate the
@@ -153,7 +181,7 @@ static bool SinkInstruction(Instruction *Inst,
// The nearest common dominator may be in a parent loop of BB, which may not
// be beneficial. Find an ancestor.
while (SuccToSinkTo != BB &&
- !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+ !IsAcceptableTarget(AA, Inst, SuccToSinkTo, DT, LI))
SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
if (SuccToSinkTo == BB)
SuccToSinkTo = nullptr;
diff --git a/llvm/test/Transforms/Sink/loadsink.ll b/llvm/test/Transforms/Sink/loadsink.ll
new file mode 100644
index 0000000000000..d1fbf740250f6
--- /dev/null
+++ b/llvm/test/Transforms/Sink/loadsink.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -passes=sink | FileCheck %s
+
+; Test that loads can be sunk to a non-immediate successor block by analyzing
+; paths for conflicting stores.
+
+declare void @readfunc() readonly willreturn
+declare void @maywritefunc() willreturn
+
+; Load can be sunk to non-immediate successor
+define void @load_can_sink(i1 %condA, i1 %condB, ptr %a, ptr %b) {
+; CHECK-LABEL: @load_can_sink(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MERGEA:%.*]]
+; CHECK: mergeA:
+; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]]
+; CHECK: thenA:
+; CHECK-NEXT: call void @readfunc()
+; CHECK-NEXT: br label [[MERGEB]]
+; CHECK: mergeB:
+; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]]
+; CHECK: thenB:
+; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: store i32 [[VALUE]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: br label [[MERGEC]]
+; CHECK: mergeC:
+; CHECK-NEXT: ret void
+;
+entry:
+ %value = load i32, ptr %a, align 4
+ br label %mergeA
+mergeA:
+ br i1 %condA, label %thenA, label %mergeB
+thenA:
+ call void @readfunc()
+ br label %mergeB
+mergeB:
+ br i1 %condB, label %thenB, label %mergeC
+thenB:
+ store i32 %value, ptr %b
+ br label %mergeC
+mergeC:
+ ret void
+}
+
+; Call may store so load cannot be sunk
+define void @load_cannot_sink(i1 %condA, i1 %condB, ptr %a, ptr %b) {
+; CHECK-LABEL: @load_cannot_sink(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MERGEA:%.*]]
+; CHECK: mergeA:
+; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]]
+; CHECK: thenA:
+; CHECK-NEXT: call void @maywritefunc()
+; CHECK-NEXT: br label [[MERGEB]]
+; CHECK: mergeB:
+; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]]
+; CHECK: thenB:
+; CHECK-NEXT: store i32 [[VALUE]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: br label [[MERGEC]]
+; CHECK: mergeC:
+; CHECK-NEXT: ret void
+;
+entry:
+ %value = load i32, ptr %a, align 4
+ br label %mergeA
+mergeA:
+ br i1 %condA, label %thenA, label %mergeB
+thenA:
+ call void @maywritefunc()
+ br label %mergeB
+mergeB:
+ br i1 %condB, label %thenB, label %mergeC
+thenB:
+ store i32 %value, ptr %b
+ br label %mergeC
+mergeC:
+ ret void
+}
+
+; Load can be sunk to non-immediate successor because load ptr is noalias
+define void @load_can_sink_noalias(i1 %condA, i1 %condB, ptr noalias %a, ptr %b) {
+; CHECK-LABEL: @load_can_sink_noalias(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MERGEA:%.*]]
+; CHECK: mergeA:
+; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]]
+; CHECK: thenA:
+; CHECK-NEXT: store i32 0, ptr [[B:%.*]], align 4
+; CHECK-NEXT: br label [[MERGEB]]
+; CHECK: mergeB:
+; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]]
+; CHECK: thenB:
+; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: store i32 [[VALUE]], ptr [[B]], align 4
+; CHECK-NEXT: br label [[MERGEC]]
+; CHECK: mergeC:
+; CHECK-NEXT: ret void
+;
+entry:
+ %value = load i32, ptr %a, align 4
+ br label %mergeA
+mergeA:
+ br i1 %condA, label %thenA, label %mergeB
+thenA:
+ store i32 0, ptr %b
+ br label %mergeB
+mergeB:
+ br i1 %condB, label %thenB, label %mergeC
+thenB:
+ store i32 %value, ptr %b
+ br label %mergeC
+mergeC:
+ ret void
+}
+
+; Load cannot be sunk to non-immediate successor because load ptr may alias
+define void @load_cannot_sink_alias(i1 %condA, i1 %condB, ptr %a, ptr %b) {
+; CHECK-LABEL: @load_cannot_sink_alias(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MERGEA:%.*]]
+; CHECK: mergeA:
+; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]]
+; CHECK: thenA:
+; CHECK-NEXT: store i32 0, ptr [[B:%.*]], align 4
+; CHECK-NEXT: br label [[MERGEB]]
+; CHECK: mergeB:
+; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]]
+; CHECK: thenB:
+; CHECK-NEXT: store i32 [[VALUE]], ptr [[B]], align 4
+; CHECK-NEXT: br label [[MERGEC]]
+; CHECK: mergeC:
+; CHECK-NEXT: ret void
+;
+entry:
+ %value = load i32, ptr %a, align 4
+ br label %mergeA
+mergeA:
+ br i1 %condA, label %thenA, label %mergeB
+thenA:
+ store i32 0, ptr %b
+ br label %mergeB
+mergeB:
+ br i1 %condB, label %thenB, label %mergeC
+thenB:
+ store i32 %value, ptr %b
+ br label %mergeC
+mergeC:
+ ret void
+}
From f9fc4a87fbf92dc4268edbab85db6103c2318767 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 16 Apr 2025 06:41:41 -0500
Subject: [PATCH 2/4] Update tests
Signed-off-by: John Lu <John.Lu at amd.com>
---
.../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 86 ++++++++++---------
...ne-sink-temporal-divergence-swdev407790.ll | 27 +++---
llvm/test/CodeGen/AMDGPU/set-wave-priority.ll | 3 +-
.../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 5 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 40 ++++-----
5 files changed, 82 insertions(+), 79 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 074272f7bed86..28ade94040688 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1330,13 +1330,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s2, 0
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[0:3], 0 addr64
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX7-NEXT: s_mov_b64 vcc, 0
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
@@ -1355,24 +1349,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX7-NEXT: s_or_b64 vcc, s[8:9], s[0:1]
; GFX7-NEXT: .LBB13_2: ; %exit
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX8-NEXT: s_mov_b64 vcc, 0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
@@ -1391,12 +1383,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX8-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX8-NEXT: .LBB13_2: ; %exit
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1]
+; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 8
; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1404,12 +1404,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W32: ; %bb.0: ; %entry
-; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
-; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W32-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
+; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX10_W32-NEXT: s_and_saveexec_b32 s1, s0
; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W32-NEXT: ; %bb.1: ; %bb
@@ -1426,9 +1422,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX10_W32-NEXT: .LBB13_2: ; %exit
; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX10_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W32-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
+; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8
@@ -1436,12 +1437,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W64: ; %bb.0: ; %entry
-; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT: s_mov_b64 vcc, 0
-; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W64-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX10_W64-NEXT: s_mov_b64 vcc, 0
; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W64-NEXT: ; %bb.1: ; %bb
@@ -1458,9 +1455,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX10_W64-NEXT: .LBB13_2: ; %exit
; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX10_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W64-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
+; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8
@@ -1468,14 +1470,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W32: ; %bb.0: ; %entry
-; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
-; GFX11_W32-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0
-; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v3
-; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W32-NEXT: ; %bb.1: ; %bb
; GFX11_W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x50
@@ -1491,6 +1489,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX11_W32-NEXT: .LBB13_2: ; %exit
; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
+; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W32-NEXT: s_waitcnt vmcnt(0)
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -1501,14 +1503,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W64: ; %bb.0: ; %entry
-; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
-; GFX11_W64-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W64-NEXT: s_mov_b64 vcc, 0
; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec
-; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v3
-; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
-; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W64-NEXT: ; %bb.1: ; %bb
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x50
@@ -1524,6 +1522,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX11_W64-NEXT: .LBB13_2: ; %exit
; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
+; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W64-NEXT: s_waitcnt vmcnt(0)
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 4a6b2ebd3d203..500659ea0ca86 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -877,14 +877,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s53, s4, 1
; CHECK-NEXT: s_add_i32 s6, s4, 5
-; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_read_u8 v46, v0
-; CHECK-NEXT: v_mov_b32_e32 v56, s53
+; CHECK-NEXT: v_or3_b32 v46, s5, v42, s53
+; CHECK-NEXT: v_mov_b32_e32 v47, s53
; CHECK-NEXT: s_mov_b32 s5, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
@@ -898,46 +895,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v46
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
+; CHECK-NEXT: v_mov_b32_e32 v47, s8
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, v0
+; CHECK-NEXT: v_mov_b32_e32 v46, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_mov_b32 s54, exec_lo
-; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
+; CHECK-NEXT: v_cmpx_lt_u32_e64 v47, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_mov_b32 s55, 0
+; CHECK-NEXT: ds_read_u8 v56, v0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB1_8
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
-; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v47
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s64, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
@@ -958,7 +957,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: ds_write_b32 v0, v47
+; CHECK-NEXT: ds_write_b32 v0, v46
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
index a27d1217031ca..0e30b4bb5925c 100644
--- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -72,13 +72,14 @@ entry:
a:
%v2 = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 1, i32 0)
+ %v3 = fadd <2 x float> %v1, %v2
%v20 = extractelement <2 x float> %v2, i32 0
%v21 = extractelement <2 x float> %v2, i32 1
%cond2 = fcmp ult float %v20, %v21
br i1 %cond2, label %b, label %c
b:
- ret <2 x float> %v2
+ ret <2 x float> %v3
c:
%v4 = fadd <2 x float> %v1, %v1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 8dfd841671730..7426ecca7301a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -41,7 +41,8 @@ bb:
%tmp20 = extractelement <4 x float> %tmp18, i32 1
%tmp21 = extractelement <4 x float> %tmp18, i32 2
%tmp22 = extractelement <4 x float> %tmp18, i32 3
- %tmp23 = bitcast float %tmp14 to i32
+ %tmp23 = fadd float %tmp14, %tmp22
+ %tmp24 = bitcast float %tmp23 to i32
br label %bb24
bb24: ; preds = %bb157, %bb
@@ -218,7 +219,7 @@ bb156: ; preds = %bb24
bb157: ; preds = %bb24
%tmp158 = bitcast float %tmp107 to i32
%tmp159 = bitcast float %tmp107 to i32
- %tmp160 = add i32 %tmp23, %tmp159
+ %tmp160 = add i32 %tmp24, %tmp159
%tmp161 = bitcast i32 %tmp160 to float
%tmp162 = insertelement <128 x float> poison, float %tmp103, i32 0
%tmp163 = insertelement <128 x float> %tmp162, float %tmp102, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4212fd3b35cd8..396c06cfbc540 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1266,26 +1266,26 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
-; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
; GFX1032-NEXT: s_mov_b32 vcc_lo, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11]
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
; GFX1032-NEXT: s_cbranch_execz .LBB22_2
; GFX1032-NEXT: ; %bb.1: ; %bb
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v1, s[2:3] glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo
; GFX1032-NEXT: .LBB22_2: ; %exit
-; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx3 v[0:2], v0, s[10:11]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX1032-NEXT: global_store_dword v0, v1, s[8:9] offset:8
+; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GFX1032-NEXT: global_store_dword v3, v0, s[8:9] offset:8
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc:
@@ -1293,26 +1293,26 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX1064-NEXT: s_mov_b64 vcc, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11]
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB22_2
; GFX1064-NEXT: ; %bb.1: ; %bb
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v1, s[6:7] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX1064-NEXT: s_and_b64 vcc, vcc, exec
; GFX1064-NEXT: .LBB22_2: ; %exit
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx3 v[0:2], v0, s[10:11]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX1064-NEXT: global_store_dword v0, v1, s[8:9] offset:8
+; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GFX1064-NEXT: global_store_dword v3, v0, s[8:9] offset:8
; GFX1064-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
From e78335e482c9563b7f4d20cf82955cbd4167fb02 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 16 Apr 2025 11:21:03 -0500
Subject: [PATCH 3/4] Update NumSgprs for GCN Trackers test
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index c5732531f5423..ec95a7ed03b95 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -73,8 +73,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
}
; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
-; GFX908: NumSgprs: 64
-; GFX908-GCNTRACKERS: NumSgprs: 64
+; GFX908: NumSgprs: 56
+; GFX908-GCNTRACKERS: NumSgprs: 56
; GFX908: NumVgprs: 43
; GFX908-GCNTRACKERS: NumVgprs: 39
; GFX908: Occupancy: 5
From 54807132163ec97e2e3f3ffedb2e3513424b985d Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 16 Apr 2025 11:41:53 -0500
Subject: [PATCH 4/4] Add test that sinks part of the way to the use
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/test/Transforms/Sink/loadsink.ll | 46 +++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/llvm/test/Transforms/Sink/loadsink.ll b/llvm/test/Transforms/Sink/loadsink.ll
index d1fbf740250f6..9a9c106559d44 100644
--- a/llvm/test/Transforms/Sink/loadsink.ll
+++ b/llvm/test/Transforms/Sink/loadsink.ll
@@ -150,3 +150,49 @@ thenB:
mergeC:
ret void
}
+
+; Load can be sunk, but not all the way to the use.
+define void @load_can_sink_part_of_the_way(i1 %condA, i1 %condB, i1 %condC, ptr noalias %a, ptr %b) {
+; CHECK-LABEL: @load_can_sink_part_of_the_way(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MERGEA:%.*]]
+; CHECK: mergeA:
+; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]]
+; CHECK: thenA:
+; CHECK-NEXT: store i32 0, ptr [[B:%.*]], align 4
+; CHECK-NEXT: br label [[MERGEB]]
+; CHECK: mergeB:
+; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]]
+; CHECK: thenB:
+; CHECK-NEXT: call void @maywritefunc()
+; CHECK-NEXT: br label [[MERGEC]]
+; CHECK: mergeC:
+; CHECK-NEXT: br i1 [[CONDC:%.*]], label [[THENC:%.*]], label [[MERGED:%.*]]
+; CHECK: thenC:
+; CHECK-NEXT: store i32 [[VALUE]], ptr [[B]], align 4
+; CHECK-NEXT: br label [[MERGED]]
+; CHECK: mergeD:
+; CHECK-NEXT: ret void
+;
+entry:
+ %value = load i32, ptr %a, align 4
+ br label %mergeA
+mergeA:
+ br i1 %condA, label %thenA, label %mergeB
+thenA:
+ store i32 0, ptr %b
+ br label %mergeB
+mergeB:
+ br i1 %condB, label %thenB, label %mergeC
+thenB:
+ call void @maywritefunc()
+ br label %mergeC
+mergeC:
+ br i1 %condC, label %thenC, label %mergeD
+thenC:
+ store i32 %value, ptr %b
+ br label %mergeD
+mergeD:
+ ret void
+}
More information about the llvm-commits
mailing list