[llvm] [FlattenCFG] Fixup Phi nodes during CFG flattening (PR #143766)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 14 14:25:30 PDT 2025
https://github.com/HighW4y2H3ll updated https://github.com/llvm/llvm-project/pull/143766
>From 7194cc0a4e20d1ad5f90c1faf70ad4beca49f05b Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Fri, 13 Jun 2025 16:15:00 -0700
Subject: [PATCH 1/4] [FlattenCFG] Flatten CFG with Phi nodes
---
llvm/lib/Transforms/Utils/FlattenCFG.cpp | 51 ++++++++++++------
.../SimplifyCFG/flatten-cfg-with-phi.ll | 53 +++++++++++++++++++
2 files changed, 89 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index 1d9408d6db433..cde9b040224cc 100644
--- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -134,10 +134,6 @@ class FlattenCFGOpt {
/// its predecessor. In Case 2, BB (BB3) only has conditional branches
/// as its predecessors.
bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
- PHINode *PHI = dyn_cast<PHINode>(BB->begin());
- if (PHI)
- return false; // For simplicity, avoid cases containing PHI nodes.
-
BasicBlock *LastCondBlock = nullptr;
BasicBlock *FirstCondBlock = nullptr;
BasicBlock *UnCondBlock = nullptr;
@@ -208,8 +204,10 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
if (Idx == -1)
Idx = CIdx;
- else if (CIdx != Idx)
- return false;
+ else if (CIdx != Idx) {
+    // Invert the branch condition
+ InvertBranch(PBI, Builder);
+ }
// PS is the successor which is not BB. Check successors to identify
// the last conditional branch.
@@ -269,11 +267,6 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
if (!PBI1 || !PBI1->isUnconditional())
return false;
- // PS2 should not contain PHI node.
- PHI = dyn_cast<PHINode>(PS2->begin());
- if (PHI)
- return false;
-
// Do the transformation.
BasicBlock *CB;
BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
@@ -291,17 +284,45 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
// Merge conditions.
Builder.SetInsertPoint(PBI);
Value *NC;
- if (Idx == 0)
- // Case 2, use parallel or.
- NC = Builder.CreateOr(PC, CC);
- else
+ if (UnCondBlock)
// Case 1, use parallel and.
NC = Builder.CreateAnd(PC, CC);
+ else
+ // Case 2, use parallel or.
+ NC = Builder.CreateOr(PC, CC);
+
+ // Fixup PHI node if needed
+ for (BasicBlock *CBS : successors(PBI)) {
+ for (PHINode &Phi : CBS->phis()) {
+ Value *origPhi0 = nullptr;
+ Value *newPhi = nullptr;
+ if (llvm::is_contained(Phi.blocks(), FirstCondBlock)) {
+ origPhi0 = Phi.removeIncomingValue(FirstCondBlock, false);
+ newPhi = origPhi0;
+ }
+ if (llvm::is_contained(Phi.blocks(), CB)) {
+ Value *origPhi1 = Phi.removeIncomingValue(CB, false);
+ newPhi = origPhi1;
+
+ if (origPhi0) {
+ // Swap branch given the conditions
+ if (PBI->getSuccessor(0) == CBS) {
+ newPhi = Builder.CreateSelect(PC, origPhi0, origPhi1);
+ } else {
+ newPhi = Builder.CreateSelect(PC, origPhi1, origPhi0);
+ }
+ }
+ }
+ if (newPhi)
+ Phi.addIncoming(newPhi, FirstCondBlock);
+ }
+ }
PBI->replaceUsesOfWith(CC, NC);
PC = NC;
if (CB == LastCondBlock)
Iteration = false;
+
// Remove internal conditional branches.
CB->dropAllReferences();
// make CB unreachable and let downstream to delete the block.
diff --git a/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
new file mode 100644
index 0000000000000..cf7c3f566e8df
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool ./opt --version 5
+; RUN: opt < %s -passes=flatten-cfg -S | FileCheck %s
+
+define i1 @_Z7compareRK1SS1_(ptr %a, ptr %b) {
+; CHECK-LABEL: @_Z7compareRK1SS1_(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = load i32, ptr %a, align 4, !tbaa !3
+; CHECK-NEXT: %1 = load i32, ptr %b, align 4, !tbaa !3
+; CHECK-NEXT: %cmp.i = icmp sge i32 %0, %1
+; CHECK-NEXT: %cmp.i19 = icmp eq i32 %0, %1
+; CHECK-NEXT: %2 = and i1 %cmp.i, %cmp.i19
+; CHECK-NEXT: %3 = select i1 %cmp.i, i1 false, i1 true
+; CHECK-NEXT: br i1 %2, label %land.rhs, label %lor.end
+; CHECK-LABEL: lor.end: ; preds = %land.rhs, %entry
+; CHECK-NEXT: %6 = phi i1 [ %cmp, %land.rhs ], [ %3, %entry ]
+; CHECK-NEXT: ret i1 %6
+entry:
+ %0 = load i32, ptr %a, align 4, !tbaa !3
+ %1 = load i32, ptr %b, align 4, !tbaa !3
+ %cmp.i = icmp slt i32 %0, %1
+ br i1 %cmp.i, label %lor.end, label %lor.rhs
+
+lor.rhs: ; preds = %entry
+ %cmp.i19 = icmp eq i32 %0, %1
+ br i1 %cmp.i19, label %land.rhs, label %lor.end
+
+land.rhs: ; preds = %lor.rhs
+ %y = getelementptr inbounds nuw i8, ptr %a, i64 4
+ %2 = load i32, ptr %y, align 4, !tbaa !8
+ %y14 = getelementptr inbounds nuw i8, ptr %b, i64 4
+ %3 = load i32, ptr %y14, align 4, !tbaa !8
+ %cmp = icmp slt i32 %2, %3
+ br label %lor.end
+
+lor.end: ; preds = %lor.rhs, %land.rhs, %entry
+ %4 = phi i1 [ true, %entry ], [ false, %lor.rhs ], [ %cmp, %land.rhs ]
+ ret i1 %4
+}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{!"clang"}
+!3 = !{!4, !5, i64 0}
+!4 = !{!"_ZTS1S", !5, i64 0, !5, i64 4}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!4, !5, i64 4}
+!9 = !{!5, !5, i64 0}
>From 70d14b5154d3c1b2690615607d34c63292a8eae0 Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Sat, 14 Jun 2025 11:28:08 -0700
Subject: [PATCH 2/4] Set insertion point before calling InvertBranch
---
llvm/lib/Transforms/Utils/FlattenCFG.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index cde9b040224cc..9a1978c7eef2c 100644
--- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -206,6 +206,8 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
Idx = CIdx;
else if (CIdx != Idx) {
// Inverse Branch Condition
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(PBI);
InvertBranch(PBI, Builder);
}
>From b60caca0e302129459982924c013b012fbdd070e Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Sat, 14 Jun 2025 10:32:33 -0700
Subject: [PATCH 3/4] Fix AMDGPU tests
---
.../AMDGPU/GlobalISel/vni8-across-blocks.ll | 124 +++++++---
...der-no-live-segment-at-def-implicit-def.ll | 66 +++---
.../divergent-branch-uniform-condition.ll | 72 +++---
llvm/test/CodeGen/AMDGPU/jump-address.ll | 2 +-
llvm/test/CodeGen/AMDGPU/predicates.ll | 6 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 212 ++++++------------
6 files changed, 219 insertions(+), 263 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 9c2fabce4bcde..7dcf3dd620030 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -480,29 +480,58 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-LABEL: v8i8_phi_chain:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
+; GFX906-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v0
+; GFX906-NEXT: s_or_b64 s[2:3], vcc, s[0:1]
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9]
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
; GFX906-NEXT: s_cbranch_execz .LBB8_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11]
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX906-NEXT: .LBB8_2: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT: s_cbranch_execz .LBB8_4
-; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: ; %bb.1: ; %bb.2
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX906-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX906-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX906-NEXT: v_cndmask_b32_e32 v4, v13, v7, vcc
+; GFX906-NEXT: v_cndmask_b32_e32 v7, v15, v9, vcc
+; GFX906-NEXT: v_mov_b32_e32 v9, 8
+; GFX906-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX906-NEXT: v_cndmask_b32_e32 v3, v11, v5, vcc
+; GFX906-NEXT: v_cndmask_b32_e32 v5, v12, v6, vcc
+; GFX906-NEXT: v_cndmask_b32_e32 v6, v14, v8, vcc
+; GFX906-NEXT: v_mov_b32_e32 v8, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v8, v0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT: v_or3_b32 v1, v0, v1, v3
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v2, v8, v0
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT: v_or3_b32 v2, v0, v2, v3
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
-; GFX906-NEXT: .LBB8_4: ; %bb.3
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: .LBB8_2: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
@@ -535,29 +564,50 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: v_cmp_gt_u32_e64 s[0:1], 15, v0
+; GFX906-NEXT: s_and_b64 s[2:3], s[0:1], vcc
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[10:11]
+; GFX906-NEXT: s_mov_b64 vcc, s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v6, 8
+; GFX906-NEXT: v_mov_b32_e32 v5, 0xff
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v1, v3
-; GFX906-NEXT: v_mov_b32_e32 v2, v4
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB9_4
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB9_3
-; GFX906-NEXT: ; %bb.2: ; %bb.2
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v4
+; GFX906-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1]
+; GFX906-NEXT: v_cndmask_b32_sdwa v9, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX906-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[0:1]
+; GFX906-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[0:1]
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_cndmask_b32_e64 v11, v2, v4, s[0:1]
+; GFX906-NEXT: v_cndmask_b32_sdwa v10, v1, v3, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX906-NEXT: v_lshlrev_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v0, v5, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v9
+; GFX906-NEXT: v_cndmask_b32_sdwa v8, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX906-NEXT: v_and_or_b32 v6, v11, v5, v6
+; GFX906-NEXT: v_or3_b32 v5, v0, v7, v10
+; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v8
+; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT: v_cndmask_b32_sdwa v7, v2, v4, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX906-NEXT: v_or3_b32 v6, v6, v0, v7
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX906-NEXT: s_cbranch_execz .LBB9_2
+; GFX906-NEXT: ; %bb.1: ; %bb.2
+; GFX906-NEXT: v_mov_b32_e32 v6, v4
; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
-; GFX906-NEXT: .LBB9_3: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: .LBB9_4: ; %bb.3
+; GFX906-NEXT: v_mov_b32_e32 v5, v3
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
+; GFX906-NEXT: .LBB9_2: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
+; GFX906-NEXT: global_store_dwordx2 v0, v[5:6], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index ad0d6d8016ad6..19f3705c97d4b 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -12,73 +12,59 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: s_mov_b32 s12, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s52, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB0_9
-; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i
-; CHECK-NEXT: s_cmp_eq_u32 s54, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_4
-; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i
-; CHECK-NEXT: s_cmp_lg_u32 s55, 0
-; CHECK-NEXT: s_mov_b32 s17, 0
-; CHECK-NEXT: s_cselect_b32 s12, -1, 0
-; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccz .LBB0_5
-; CHECK-NEXT: ; %bb.3:
-; CHECK-NEXT: s_mov_b32 s18, 0
-; CHECK-NEXT: s_branch .LBB0_6
-; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: s_mov_b32 s14, s12
-; CHECK-NEXT: s_mov_b32 s15, s12
-; CHECK-NEXT: s_mov_b32 s13, s12
-; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15]
-; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13]
-; CHECK-NEXT: s_branch .LBB0_8
-; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i
+; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i
; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0
-; CHECK-NEXT: s_mov_b32 s18, 1.0
-; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
-; CHECK-NEXT: .LBB0_6: ; %Flow
-; CHECK-NEXT: s_mov_b32 s48, 1.0
-; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT: s_cmp_lg_u32 s55, 0
+; CHECK-NEXT: s_cselect_b32 s17, -1, 0
+; CHECK-NEXT: s_or_b32 s12, s17, s12
+; CHECK-NEXT: s_cmp_lg_u32 s54, 0
+; CHECK-NEXT: s_cselect_b32 s13, -1, 0
+; CHECK-NEXT: s_and_b32 s18, s13, exec_lo
+; CHECK-NEXT: s_cselect_b32 s48, 1.0, 0
+; CHECK-NEXT: s_and_b32 s12, s13, s12
; CHECK-NEXT: s_mov_b32 s49, s48
; CHECK-NEXT: s_mov_b32 s50, s48
+; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
; CHECK-NEXT: s_mov_b32 s51, s48
-; CHECK-NEXT: s_cbranch_vccnz .LBB0_8
-; CHECK-NEXT: ; %bb.7: ; %if.end273.i.i
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_3
+; CHECK-NEXT: ; %bb.2: ; %if.end273.i.i
; CHECK-NEXT: s_add_u32 s12, s8, 40
; CHECK-NEXT: s_addc_u32 s13, s9, 0
-; CHECK-NEXT: s_getpc_b64 s[20:21]
-; CHECK-NEXT: s_add_u32 s20, s20, _Z3dotDv3_fS_ at gotpcrel32@lo+4
-; CHECK-NEXT: s_addc_u32 s21, s21, _Z3dotDv3_fS_ at gotpcrel32@hi+12
+; CHECK-NEXT: s_getpc_b64 s[18:19]
+; CHECK-NEXT: s_add_u32 s18, s18, _Z3dotDv3_fS_ at gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s19, s19, _Z3dotDv3_fS_ at gotpcrel32@hi+12
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 1.0, 0, s17
+; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, 0, s17
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT: s_load_dwordx2 s[20:21], s[20:21], 0x0
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1
-; CHECK-NEXT: v_add_f32_e64 v1, s17, s18
+; CHECK-NEXT: v_lshlrev_b32_e32 v5, 10, v1
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13]
+; CHECK-NEXT: v_add_f32_e32 v1, v4, v3
; CHECK-NEXT: s_mov_b32 s12, s14
-; CHECK-NEXT: v_or3_b32 v31, v0, v3, v2
-; CHECK-NEXT: v_mov_b32_e32 v0, v1
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_or3_b32 v31, v0, v5, v2
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_mov_b32 s13, s15
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_mov_b32 s14, s16
; CHECK-NEXT: s_mov_b32 s48, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35]
; CHECK-NEXT: s_mov_b32 s49, s48
; CHECK-NEXT: s_mov_b32 s50, s48
; CHECK-NEXT: s_mov_b32 s51, s48
-; CHECK-NEXT: .LBB0_8: ; %if.end294.i.i
+; CHECK-NEXT: .LBB0_3: ; %if.end294.i.i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; CHECK-NEXT: .LBB0_9: ; %kernel_direct_lighting.exit
+; CHECK-NEXT: .LBB0_4: ; %kernel_direct_lighting.exit
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20
; CHECK-NEXT: v_mov_b32_e32 v0, s48
; CHECK-NEXT: v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 827cb4ac2589a..98aa1f0849eec 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -20,50 +20,48 @@ define amdgpu_ps void @main(i32 %0, float %1) {
; ISA: ; %bb.0: ; %start
; ISA-NEXT: v_readfirstlane_b32 s0, v0
; ISA-NEXT: s_mov_b32 m0, s0
-; ISA-NEXT: s_mov_b32 s10, 0
+; ISA-NEXT: s_mov_b64 s[4:5], 0
; ISA-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x
; ISA-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; ISA-NEXT: s_mov_b64 s[0:1], 0
-; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5
-; ISA-NEXT: ; implicit-def: $sgpr2_sgpr3
-; ISA-NEXT: s_branch .LBB0_3
-; ISA-NEXT: .LBB0_1: ; %Flow1
-; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT: s_or_b64 exec, exec, s[4:5]
-; ISA-NEXT: s_mov_b64 s[8:9], 0
-; ISA-NEXT: s_mov_b64 s[4:5], s[6:7]
-; ISA-NEXT: .LBB0_2: ; %Flow
-; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT: s_and_b64 s[6:7], exec, s[4:5]
-; ISA-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
-; ISA-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
-; ISA-NEXT: s_and_b64 s[6:7], s[8:9], exec
-; ISA-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
-; ISA-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; ISA-NEXT: s_cbranch_execz .LBB0_6
-; ISA-NEXT: .LBB0_3: ; %loop
+; ISA-NEXT: v_mov_b32_e32 v1, 0
+; ISA-NEXT: ; implicit-def: $sgpr10_sgpr11
+; ISA-NEXT: ; implicit-def: $sgpr8_sgpr9
+; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7
+; ISA-NEXT: s_branch .LBB0_2
+; ISA-NEXT: .LBB0_1: ; %Flow
+; ISA-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; ISA-NEXT: s_or_b64 exec, exec, s[0:1]
+; ISA-NEXT: s_and_b64 s[0:1], exec, s[8:9]
+; ISA-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5]
+; ISA-NEXT: s_andn2_b64 s[0:1], s[6:7], exec
+; ISA-NEXT: s_and_b64 s[2:3], s[10:11], exec
+; ISA-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; ISA-NEXT: s_cbranch_execz .LBB0_4
+; ISA-NEXT: .LBB0_2: ; %loop
; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
-; ISA-NEXT: s_or_b64 s[4:5], s[4:5], exec
-; ISA-NEXT: s_mov_b64 s[6:7], -1
-; ISA-NEXT: s_cmp_lt_u32 s10, 32
-; ISA-NEXT: s_mov_b64 s[8:9], -1
-; ISA-NEXT: s_cbranch_scc0 .LBB0_2
-; ISA-NEXT: ; %bb.4: ; %endif1
-; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; ISA-NEXT: v_cmp_lt_u32_e64 s[0:1], 31, v1
+; ISA-NEXT: v_cmp_gt_u32_e64 s[2:3], 32, v1
+; ISA-NEXT: s_andn2_b64 s[10:11], s[10:11], exec
+; ISA-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; ISA-NEXT: s_and_b64 s[2:3], s[2:3], vcc
+; ISA-NEXT: s_or_b64 s[8:9], s[8:9], exec
+; ISA-NEXT: s_or_b64 s[10:11], s[10:11], s[0:1]
+; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
; ISA-NEXT: s_cbranch_execz .LBB0_1
-; ISA-NEXT: ; %bb.5: ; %endif2
-; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT: s_add_i32 s10, s10, 1
-; ISA-NEXT: s_xor_b64 s[6:7], exec, -1
+; ISA-NEXT: ; %bb.3: ; %endif2
+; ISA-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; ISA-NEXT: v_add_u32_e32 v1, 1, v1
+; ISA-NEXT: s_andn2_b64 s[8:9], s[8:9], exec
+; ISA-NEXT: s_andn2_b64 s[10:11], s[10:11], exec
; ISA-NEXT: s_branch .LBB0_1
-; ISA-NEXT: .LBB0_6: ; %Flow2
-; ISA-NEXT: s_or_b64 exec, exec, s[0:1]
+; ISA-NEXT: .LBB0_4: ; %Flow2
+; ISA-NEXT: s_or_b64 exec, exec, s[4:5]
; ISA-NEXT: v_mov_b32_e32 v1, 0
-; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
-; ISA-NEXT: ; %bb.7: ; %if1
+; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[6:7]
+; ISA-NEXT: ; %bb.5: ; %if1
; ISA-NEXT: v_sqrt_f32_e32 v1, v0
-; ISA-NEXT: ; %bb.8: ; %endloop
+; ISA-NEXT: ; %bb.6: ; %endloop
; ISA-NEXT: s_or_b64 exec, exec, s[0:1]
; ISA-NEXT: exp mrt0 v1, v1, v1, v1 done vm
; ISA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/jump-address.ll b/llvm/test/CodeGen/AMDGPU/jump-address.ll
index d58db378e1384..557536aa45483 100644
--- a/llvm/test/CodeGen/AMDGPU/jump-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/jump-address.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
-; CHECK: JUMP @6
+; CHECK: JUMP @3
; CHECK: EXPORT
; CHECK-NOT: EXPORT
diff --git a/llvm/test/CodeGen/AMDGPU/predicates.ll b/llvm/test/CodeGen/AMDGPU/predicates.ll
index 6a23875c18241..c5ef622d3aaf8 100644
--- a/llvm/test/CodeGen/AMDGPU/predicates.ll
+++ b/llvm/test/CodeGen/AMDGPU/predicates.ll
@@ -45,10 +45,8 @@ ENDIF:
}
; CHECK-LABEL: {{^}}nested_if:
-; CHECK: ALU_PUSH_BEFORE
-; CHECK: JUMP
-; CHECK: POP
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
+; CHECK: ALU
+; CHECK: CNDGT_INT
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
define amdgpu_kernel void @nested_if(ptr addrspace(1) %out, i32 %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989a2507..b0ceae03b013c 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -391,30 +391,25 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_chain:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX942-NEXT: v_cmp_lt_u32_e32 vcc, 14, v4
+; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
-; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[10:11]
+; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
; GFX942-NEXT: s_cbranch_execz .LBB8_2
-; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2
-; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX942-NEXT: .LBB8_2: ; %Flow
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX942-NEXT: s_cbranch_execz .LBB8_4
-; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: ; %bb.1: ; %bb.2
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT: .LBB8_4: ; %bb.3
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT: .LBB8_2: ; %bb.3
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_waitcnt vmcnt(1)
; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
@@ -449,36 +444,26 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_cmp_lt_u32_e32 vcc, 14, v4
+; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[10:11]
+; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
; GFX942-NEXT: s_cbranch_execz .LBB9_2
-; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
-; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX942-NEXT: .LBB9_2: ; %Flow
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX942-NEXT: s_cbranch_execz .LBB9_4
-; GFX942-NEXT: ; %bb.3: ; %bb.2
+; GFX942-NEXT: ; %bb.1: ; %bb.2
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX942-NEXT: .LBB9_4: ; %bb.3
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT: .LBB9_2: ; %bb.3
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -507,88 +492,30 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_const:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v16, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v16
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v16
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v16
+; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX942-NEXT: v_cmp_lt_u32_e32 vcc, 14, v4
+; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2
-; GFX942-NEXT: ; implicit-def: $vgpr12
-; GFX942-NEXT: ; implicit-def: $vgpr10
-; GFX942-NEXT: ; implicit-def: $vgpr13
-; GFX942-NEXT: ; implicit-def: $vgpr14
-; GFX942-NEXT: ; implicit-def: $vgpr11
-; GFX942-NEXT: ; implicit-def: $vgpr15
-; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v0
-; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[10:11]
+; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
; GFX942-NEXT: s_cbranch_execz .LBB10_2
-; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v16
-; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v4, 8
-; GFX942-NEXT: v_mov_b32_e32 v5, 7
-; GFX942-NEXT: v_mov_b32_e32 v6, 6
-; GFX942-NEXT: v_mov_b32_e32 v1, 5
-; GFX942-NEXT: v_mov_b32_e32 v7, 4
-; GFX942-NEXT: v_mov_b32_e32 v8, 3
-; GFX942-NEXT: v_mov_b32_e32 v9, 2
-; GFX942-NEXT: v_mov_b32_e32 v0, 1
-; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT: ; %bb.1: ; %bb.2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v2
-; GFX942-NEXT: .LBB10_2: ; %Flow
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX942-NEXT: s_cbranch_execz .LBB10_4
-; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v9
-; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7
-; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v4
-; GFX942-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX942-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[12:13]
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v12, v9
-; GFX942-NEXT: v_mov_b32_e32 v10, v8
-; GFX942-NEXT: v_mov_b32_e32 v13, v7
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v14, v6
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: v_mov_b32_e32 v15, v4
-; GFX942-NEXT: .LBB10_4: ; %bb.3
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v12
-; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v13
-; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v15
-; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14
-; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x8070605
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x4030201
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
+; GFX942-NEXT: .LBB10_2: ; %bb.3
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -617,30 +544,27 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5
+; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
+; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
+; GFX942-NEXT: s_and_b64 s[2:3], s[0:1], vcc
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
-; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX942-NEXT: s_cbranch_execz .LBB11_4
-; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5
-; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX942-NEXT: s_cbranch_execz .LBB11_3
-; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13]
-; GFX942-NEXT: .LBB11_3: ; %Flow
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: .LBB11_4: ; %bb.3
+; GFX942-NEXT: v_cndmask_b32_e64 v5, v1, v3, s[0:1]
+; GFX942-NEXT: v_cndmask_b32_e64 v4, v0, v2, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-NEXT: s_cbranch_execz .LBB11_2
+; GFX942-NEXT: ; %bb.1: ; %bb.2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: .LBB11_2: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
>From e531e5238894e6f8acb53368740c835a07c87be4 Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Sat, 14 Jun 2025 14:25:13 -0700
Subject: [PATCH 4/4] Complete checking in the test case
---
.../Transforms/SimplifyCFG/flatten-cfg-with-phi.ll | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
index cf7c3f566e8df..7f2d12eba5b73 100644
--- a/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
+++ b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
@@ -12,7 +12,16 @@ define i1 @_Z7compareRK1SS1_(ptr %a, ptr %b) {
; CHECK-NEXT: %2 = and i1 %cmp.i, %cmp.i19
; CHECK-NEXT: %3 = select i1 %cmp.i, i1 false, i1 true
; CHECK-NEXT: br i1 %2, label %land.rhs, label %lor.end
-; CHECK-LABEL: lor.end: ; preds = %land.rhs, %entry
+; CHECK-EMPTY:
+; CHECK-NEXT: land.rhs: ; preds = %entry
+; CHECK-NEXT: %y = getelementptr inbounds nuw i8, ptr %a, i64 4
+; CHECK-NEXT: %4 = load i32, ptr %y, align 4, !tbaa !8
+; CHECK-NEXT: %y14 = getelementptr inbounds nuw i8, ptr %b, i64 4
+; CHECK-NEXT: %5 = load i32, ptr %y14, align 4, !tbaa !8
+; CHECK-NEXT: %cmp = icmp slt i32 %4, %5
+; CHECK-NEXT: br label %lor.end
+; CHECK-EMPTY:
+; CHECK-NEXT: lor.end: ; preds = %land.rhs, %entry
; CHECK-NEXT: %6 = phi i1 [ %cmp, %land.rhs ], [ %3, %entry ]
; CHECK-NEXT: ret i1 %6
entry:
More information about the llvm-commits
mailing list