[llvm] [FlattenCFG] Fixup Phi nodes during CFG flattening (PR #143766)

via llvm-commits llvm-commits at lists.llvm.org
Sat Jun 14 14:25:30 PDT 2025


https://github.com/HighW4y2H3ll updated https://github.com/llvm/llvm-project/pull/143766

>From 7194cc0a4e20d1ad5f90c1faf70ad4beca49f05b Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Fri, 13 Jun 2025 16:15:00 -0700
Subject: [PATCH 1/4] [FlattenCFG] Flatten CFG with Phi nodes

---
 llvm/lib/Transforms/Utils/FlattenCFG.cpp      | 51 ++++++++++++------
 .../SimplifyCFG/flatten-cfg-with-phi.ll       | 53 +++++++++++++++++++
 2 files changed, 89 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll

diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index 1d9408d6db433..cde9b040224cc 100644
--- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -134,10 +134,6 @@ class FlattenCFGOpt {
 ///  its predecessor.  In Case 2, BB (BB3) only has conditional branches
 ///  as its predecessors.
 bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
-  PHINode *PHI = dyn_cast<PHINode>(BB->begin());
-  if (PHI)
-    return false; // For simplicity, avoid cases containing PHI nodes.
-
   BasicBlock *LastCondBlock = nullptr;
   BasicBlock *FirstCondBlock = nullptr;
   BasicBlock *UnCondBlock = nullptr;
@@ -208,8 +204,10 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
 
     if (Idx == -1)
       Idx = CIdx;
-    else if (CIdx != Idx)
-      return false;
+    else if (CIdx != Idx) {
+      // Inverse Branch Condition
+      InvertBranch(PBI, Builder);
+    }
 
     // PS is the successor which is not BB. Check successors to identify
     // the last conditional branch.
@@ -269,11 +267,6 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
   if (!PBI1 || !PBI1->isUnconditional())
     return false;
 
-  // PS2 should not contain PHI node.
-  PHI = dyn_cast<PHINode>(PS2->begin());
-  if (PHI)
-    return false;
-
   // Do the transformation.
   BasicBlock *CB;
   BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
@@ -291,17 +284,45 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
     // Merge conditions.
     Builder.SetInsertPoint(PBI);
     Value *NC;
-    if (Idx == 0)
-      // Case 2, use parallel or.
-      NC = Builder.CreateOr(PC, CC);
-    else
+    if (UnCondBlock)
       // Case 1, use parallel and.
       NC = Builder.CreateAnd(PC, CC);
+    else
+      // Case 2, use parallel or.
+      NC = Builder.CreateOr(PC, CC);
+
+    // Fixup PHI node if needed
+    for (BasicBlock *CBS : successors(PBI)) {
+      for (PHINode &Phi : CBS->phis()) {
+        Value *origPhi0 = nullptr;
+        Value *newPhi = nullptr;
+        if (llvm::is_contained(Phi.blocks(), FirstCondBlock)) {
+          origPhi0 = Phi.removeIncomingValue(FirstCondBlock, false);
+          newPhi = origPhi0;
+        }
+        if (llvm::is_contained(Phi.blocks(), CB)) {
+          Value *origPhi1 = Phi.removeIncomingValue(CB, false);
+          newPhi = origPhi1;
+
+          if (origPhi0) {
+            // Swap branch given the conditions
+            if (PBI->getSuccessor(0) == CBS) {
+              newPhi = Builder.CreateSelect(PC, origPhi0, origPhi1);
+            } else {
+              newPhi = Builder.CreateSelect(PC, origPhi1, origPhi0);
+            }
+          }
+        }
+        if (newPhi)
+          Phi.addIncoming(newPhi, FirstCondBlock);
+      }
+    }
 
     PBI->replaceUsesOfWith(CC, NC);
     PC = NC;
     if (CB == LastCondBlock)
       Iteration = false;
+
     // Remove internal conditional branches.
     CB->dropAllReferences();
     // make CB unreachable and let downstream to delete the block.
diff --git a/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
new file mode 100644
index 0000000000000..cf7c3f566e8df
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool ./opt --version 5
+; RUN: opt < %s -passes=flatten-cfg -S | FileCheck %s
+
+define i1 @_Z7compareRK1SS1_(ptr %a, ptr %b) {
+; CHECK-LABEL: @_Z7compareRK1SS1_(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %0 = load i32, ptr %a, align 4, !tbaa !3
+; CHECK-NEXT:   %1 = load i32, ptr %b, align 4, !tbaa !3
+; CHECK-NEXT:   %cmp.i = icmp sge i32 %0, %1
+; CHECK-NEXT:   %cmp.i19 = icmp eq i32 %0, %1
+; CHECK-NEXT:   %2 = and i1 %cmp.i, %cmp.i19
+; CHECK-NEXT:   %3 = select i1 %cmp.i, i1 false, i1 true
+; CHECK-NEXT:   br i1 %2, label %land.rhs, label %lor.end
+; CHECK-LABEL: lor.end:                                          ; preds = %land.rhs, %entry
+; CHECK-NEXT:   %6 = phi i1 [ %cmp, %land.rhs ], [ %3, %entry ]
+; CHECK-NEXT:   ret i1 %6
+entry:
+  %0 = load i32, ptr %a, align 4, !tbaa !3
+  %1 = load i32, ptr %b, align 4, !tbaa !3
+  %cmp.i = icmp slt i32 %0, %1
+  br i1 %cmp.i, label %lor.end, label %lor.rhs
+
+lor.rhs:                                          ; preds = %entry
+  %cmp.i19 = icmp eq i32 %0, %1
+  br i1 %cmp.i19, label %land.rhs, label %lor.end
+
+land.rhs:                                         ; preds = %lor.rhs
+  %y = getelementptr inbounds nuw i8, ptr %a, i64 4
+  %2 = load i32, ptr %y, align 4, !tbaa !8
+  %y14 = getelementptr inbounds nuw i8, ptr %b, i64 4
+  %3 = load i32, ptr %y14, align 4, !tbaa !8
+  %cmp = icmp slt i32 %2, %3
+  br label %lor.end
+
+lor.end:                                          ; preds = %lor.rhs, %land.rhs, %entry
+  %4 = phi i1 [ true, %entry ], [ false, %lor.rhs ], [ %cmp, %land.rhs ]
+  ret i1 %4
+}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{!"clang"}
+!3 = !{!4, !5, i64 0}
+!4 = !{!"_ZTS1S", !5, i64 0, !5, i64 4}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!4, !5, i64 4}
+!9 = !{!5, !5, i64 0}

>From 70d14b5154d3c1b2690615607d34c63292a8eae0 Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Sat, 14 Jun 2025 11:28:08 -0700
Subject: [PATCH 2/4] Set insertion point before calling InvertBranch

---
 llvm/lib/Transforms/Utils/FlattenCFG.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index cde9b040224cc..9a1978c7eef2c 100644
--- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -206,6 +206,8 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
       Idx = CIdx;
     else if (CIdx != Idx) {
       // Inverse Branch Condition
+      IRBuilder<>::InsertPointGuard Guard(Builder);
+      Builder.SetInsertPoint(PBI);
       InvertBranch(PBI, Builder);
     }
 

>From b60caca0e302129459982924c013b012fbdd070e Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Sat, 14 Jun 2025 10:32:33 -0700
Subject: [PATCH 3/4] Fix AMDGPU tests

---
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   | 124 +++++++---
 ...der-no-live-segment-at-def-implicit-def.ll |  66 +++---
 .../divergent-branch-uniform-condition.ll     |  72 +++---
 llvm/test/CodeGen/AMDGPU/jump-address.ll      |   2 +-
 llvm/test/CodeGen/AMDGPU/predicates.ll        |   6 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 212 ++++++------------
 6 files changed, 219 insertions(+), 263 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 9c2fabce4bcde..7dcf3dd620030 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -480,29 +480,58 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-LABEL: v8i8_phi_chain:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e64 s[0:1], 7, v0
+; GFX906-NEXT:    s_or_b64 s[2:3], vcc, s[0:1]
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[8:9]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[8:9]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[10:11]
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[10:11]
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX906-NEXT:  .LBB8_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT:    s_cbranch_execz .LBB8_4
-; GFX906-NEXT:  ; %bb.3: ; %bb.2
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:  ; %bb.1: ; %bb.2
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX906-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX906-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX906-NEXT:    v_cndmask_b32_e32 v4, v13, v7, vcc
+; GFX906-NEXT:    v_cndmask_b32_e32 v7, v15, v9, vcc
+; GFX906-NEXT:    v_mov_b32_e32 v9, 8
+; GFX906-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX906-NEXT:    v_cndmask_b32_e32 v3, v11, v5, vcc
+; GFX906-NEXT:    v_cndmask_b32_e32 v5, v12, v6, vcc
+; GFX906-NEXT:    v_cndmask_b32_e32 v6, v14, v8, vcc
+; GFX906-NEXT:    v_mov_b32_e32 v8, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v8, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v5
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    v_or3_b32 v1, v0, v1, v3
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v2, v8, v0
+; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    v_or3_b32 v2, v0, v2, v3
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[12:13]
-; GFX906-NEXT:  .LBB8_4: ; %bb.3
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:  .LBB8_2: ; %bb.3
+; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[14:15]
@@ -535,29 +564,50 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT:    v_cmp_gt_u32_e64 s[0:1], 15, v0
+; GFX906-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[8:9]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[8:9]
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[10:11]
+; GFX906-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v6, 8
+; GFX906-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v1, v3
-; GFX906-NEXT:    v_mov_b32_e32 v2, v4
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_4
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[10:11]
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_3
-; GFX906-NEXT:  ; %bb.2: ; %bb.2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX906-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
+; GFX906-NEXT:    v_cndmask_b32_sdwa v9, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX906-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[0:1]
+; GFX906-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[0:1]
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT:    v_cndmask_b32_e64 v11, v2, v4, s[0:1]
+; GFX906-NEXT:    v_cndmask_b32_sdwa v10, v1, v3, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v0, v5, v7
+; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
+; GFX906-NEXT:    v_cndmask_b32_sdwa v8, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX906-NEXT:    v_and_or_b32 v6, v11, v5, v6
+; GFX906-NEXT:    v_or3_b32 v5, v0, v7, v10
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v8
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    v_cndmask_b32_sdwa v7, v2, v4, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX906-NEXT:    v_or3_b32 v6, v6, v0, v7
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX906-NEXT:    s_cbranch_execz .LBB9_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.2
+; GFX906-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[12:13]
-; GFX906-NEXT:  .LBB9_3: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:  .LBB9_4: ; %bb.3
+; GFX906-NEXT:    v_mov_b32_e32 v5, v3
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[12:13]
+; GFX906-NEXT:  .LBB9_2: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[14:15]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[5:6], s[14:15]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index ad0d6d8016ad6..19f3705c97d4b 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -12,73 +12,59 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_load_dwordx8 s[48:55], s[8:9], 0x0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b32 s12, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s52, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB0_9
-; CHECK-NEXT:  ; %bb.1: ; %if.end13.i.i
-; CHECK-NEXT:    s_cmp_eq_u32 s54, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_4
-; CHECK-NEXT:  ; %bb.2: ; %if.else251.i.i
-; CHECK-NEXT:    s_cmp_lg_u32 s55, 0
-; CHECK-NEXT:    s_mov_b32 s17, 0
-; CHECK-NEXT:    s_cselect_b32 s12, -1, 0
-; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
-; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    s_mov_b32 s18, 0
-; CHECK-NEXT:    s_branch .LBB0_6
-; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    s_mov_b32 s14, s12
-; CHECK-NEXT:    s_mov_b32 s15, s12
-; CHECK-NEXT:    s_mov_b32 s13, s12
-; CHECK-NEXT:    s_mov_b64 s[50:51], s[14:15]
-; CHECK-NEXT:    s_mov_b64 s[48:49], s[12:13]
-; CHECK-NEXT:    s_branch .LBB0_8
-; CHECK-NEXT:  .LBB0_5: ; %if.then263.i.i
+; CHECK-NEXT:  ; %bb.1: ; %if.end13.i.i
 ; CHECK-NEXT:    v_cmp_lt_f32_e64 s12, s53, 0
-; CHECK-NEXT:    s_mov_b32 s18, 1.0
-; CHECK-NEXT:    s_mov_b32 s17, 0x7fc00000
-; CHECK-NEXT:  .LBB0_6: ; %Flow
-; CHECK-NEXT:    s_mov_b32 s48, 1.0
-; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT:    s_cmp_lg_u32 s55, 0
+; CHECK-NEXT:    s_cselect_b32 s17, -1, 0
+; CHECK-NEXT:    s_or_b32 s12, s17, s12
+; CHECK-NEXT:    s_cmp_lg_u32 s54, 0
+; CHECK-NEXT:    s_cselect_b32 s13, -1, 0
+; CHECK-NEXT:    s_and_b32 s18, s13, exec_lo
+; CHECK-NEXT:    s_cselect_b32 s48, 1.0, 0
+; CHECK-NEXT:    s_and_b32 s12, s13, s12
 ; CHECK-NEXT:    s_mov_b32 s49, s48
 ; CHECK-NEXT:    s_mov_b32 s50, s48
+; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s12
 ; CHECK-NEXT:    s_mov_b32 s51, s48
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_8
-; CHECK-NEXT:  ; %bb.7: ; %if.end273.i.i
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_3
+; CHECK-NEXT:  ; %bb.2: ; %if.end273.i.i
 ; CHECK-NEXT:    s_add_u32 s12, s8, 40
 ; CHECK-NEXT:    s_addc_u32 s13, s9, 0
-; CHECK-NEXT:    s_getpc_b64 s[20:21]
-; CHECK-NEXT:    s_add_u32 s20, s20, _Z3dotDv3_fS_ at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s21, s21, _Z3dotDv3_fS_ at gotpcrel32@hi+12
+; CHECK-NEXT:    s_getpc_b64 s[18:19]
+; CHECK-NEXT:    s_add_u32 s18, s18, _Z3dotDv3_fS_ at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s19, s19, _Z3dotDv3_fS_ at gotpcrel32@hi+12
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0, s17
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[18:19], 0x0
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, 0, s17
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[20:21], s[20:21], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 10, v1
-; CHECK-NEXT:    v_add_f32_e64 v1, s17, s18
+; CHECK-NEXT:    v_lshlrev_b32_e32 v5, 10, v1
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; CHECK-NEXT:    v_add_f32_e32 v1, v4, v3
 ; CHECK-NEXT:    s_mov_b32 s12, s14
-; CHECK-NEXT:    v_or3_b32 v31, v0, v3, v2
-; CHECK-NEXT:    v_mov_b32_e32 v0, v1
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_or3_b32 v31, v0, v5, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    s_mov_b32 s13, s15
+; CHECK-NEXT:    v_mov_b32_e32 v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_mov_b32 s14, s16
 ; CHECK-NEXT:    s_mov_b32 s48, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[34:35]
 ; CHECK-NEXT:    s_mov_b32 s49, s48
 ; CHECK-NEXT:    s_mov_b32 s50, s48
 ; CHECK-NEXT:    s_mov_b32 s51, s48
-; CHECK-NEXT:  .LBB0_8: ; %if.end294.i.i
+; CHECK-NEXT:  .LBB0_3: ; %if.end294.i.i
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; CHECK-NEXT:  .LBB0_9: ; %kernel_direct_lighting.exit
+; CHECK-NEXT:  .LBB0_4: ; %kernel_direct_lighting.exit
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x20
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s48
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 827cb4ac2589a..98aa1f0849eec 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -20,50 +20,48 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA:       ; %bb.0: ; %start
 ; ISA-NEXT:    v_readfirstlane_b32 s0, v0
 ; ISA-NEXT:    s_mov_b32 m0, s0
-; ISA-NEXT:    s_mov_b32 s10, 0
+; ISA-NEXT:    s_mov_b64 s[4:5], 0
 ; ISA-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
 ; ISA-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
-; ISA-NEXT:    s_mov_b64 s[0:1], 0
-; ISA-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; ISA-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; ISA-NEXT:    s_branch .LBB0_3
-; ISA-NEXT:  .LBB0_1: ; %Flow1
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_or_b64 exec, exec, s[4:5]
-; ISA-NEXT:    s_mov_b64 s[8:9], 0
-; ISA-NEXT:    s_mov_b64 s[4:5], s[6:7]
-; ISA-NEXT:  .LBB0_2: ; %Flow
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_and_b64 s[6:7], exec, s[4:5]
-; ISA-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
-; ISA-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
-; ISA-NEXT:    s_and_b64 s[6:7], s[8:9], exec
-; ISA-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
-; ISA-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; ISA-NEXT:    s_cbranch_execz .LBB0_6
-; ISA-NEXT:  .LBB0_3: ; %loop
+; ISA-NEXT:    v_mov_b32_e32 v1, 0
+; ISA-NEXT:    ; implicit-def: $sgpr10_sgpr11
+; ISA-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; ISA-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; ISA-NEXT:    s_branch .LBB0_2
+; ISA-NEXT:  .LBB0_1: ; %Flow
+; ISA-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; ISA-NEXT:    s_or_b64 exec, exec, s[0:1]
+; ISA-NEXT:    s_and_b64 s[0:1], exec, s[8:9]
+; ISA-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; ISA-NEXT:    s_andn2_b64 s[0:1], s[6:7], exec
+; ISA-NEXT:    s_and_b64 s[2:3], s[10:11], exec
+; ISA-NEXT:    s_or_b64 s[6:7], s[0:1], s[2:3]
+; ISA-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; ISA-NEXT:    s_cbranch_execz .LBB0_4
+; ISA-NEXT:  .LBB0_2: ; %loop
 ; ISA-NEXT:    ; =>This Inner Loop Header: Depth=1
-; ISA-NEXT:    s_or_b64 s[4:5], s[4:5], exec
-; ISA-NEXT:    s_mov_b64 s[6:7], -1
-; ISA-NEXT:    s_cmp_lt_u32 s10, 32
-; ISA-NEXT:    s_mov_b64 s[8:9], -1
-; ISA-NEXT:    s_cbranch_scc0 .LBB0_2
-; ISA-NEXT:  ; %bb.4: ; %endif1
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; ISA-NEXT:    v_cmp_lt_u32_e64 s[0:1], 31, v1
+; ISA-NEXT:    v_cmp_gt_u32_e64 s[2:3], 32, v1
+; ISA-NEXT:    s_andn2_b64 s[10:11], s[10:11], exec
+; ISA-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; ISA-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; ISA-NEXT:    s_or_b64 s[8:9], s[8:9], exec
+; ISA-NEXT:    s_or_b64 s[10:11], s[10:11], s[0:1]
+; ISA-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
 ; ISA-NEXT:    s_cbranch_execz .LBB0_1
-; ISA-NEXT:  ; %bb.5: ; %endif2
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_add_i32 s10, s10, 1
-; ISA-NEXT:    s_xor_b64 s[6:7], exec, -1
+; ISA-NEXT:  ; %bb.3: ; %endif2
+; ISA-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; ISA-NEXT:    v_add_u32_e32 v1, 1, v1
+; ISA-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
+; ISA-NEXT:    s_andn2_b64 s[10:11], s[10:11], exec
 ; ISA-NEXT:    s_branch .LBB0_1
-; ISA-NEXT:  .LBB0_6: ; %Flow2
-; ISA-NEXT:    s_or_b64 exec, exec, s[0:1]
+; ISA-NEXT:  .LBB0_4: ; %Flow2
+; ISA-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; ISA-NEXT:    v_mov_b32_e32 v1, 0
-; ISA-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
-; ISA-NEXT:  ; %bb.7: ; %if1
+; ISA-NEXT:    s_and_saveexec_b64 s[0:1], s[6:7]
+; ISA-NEXT:  ; %bb.5: ; %if1
 ; ISA-NEXT:    v_sqrt_f32_e32 v1, v0
-; ISA-NEXT:  ; %bb.8: ; %endloop
+; ISA-NEXT:  ; %bb.6: ; %endloop
 ; ISA-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; ISA-NEXT:    exp mrt0 v1, v1, v1, v1 done vm
 ; ISA-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/jump-address.ll b/llvm/test/CodeGen/AMDGPU/jump-address.ll
index d58db378e1384..557536aa45483 100644
--- a/llvm/test/CodeGen/AMDGPU/jump-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/jump-address.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: JUMP @6
+; CHECK: JUMP @3
 ; CHECK: EXPORT
 ; CHECK-NOT: EXPORT
 
diff --git a/llvm/test/CodeGen/AMDGPU/predicates.ll b/llvm/test/CodeGen/AMDGPU/predicates.ll
index 6a23875c18241..c5ef622d3aaf8 100644
--- a/llvm/test/CodeGen/AMDGPU/predicates.ll
+++ b/llvm/test/CodeGen/AMDGPU/predicates.ll
@@ -45,10 +45,8 @@ ENDIF:
 }
 
 ; CHECK-LABEL: {{^}}nested_if:
-; CHECK: ALU_PUSH_BEFORE
-; CHECK: JUMP
-; CHECK: POP
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
+; CHECK: ALU
+; CHECK: CNDGT_INT
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 define amdgpu_kernel void @nested_if(ptr addrspace(1) %out, i32 %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989a2507..b0ceae03b013c 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -391,30 +391,25 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX942-LABEL: v8i8_phi_chain:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
-; GFX942-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v2
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
+; GFX942-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v4
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 7, v4
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[8:9]
-; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v5, s[8:9]
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v5, s[10:11]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[0:1]
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
 ; GFX942-NEXT:    s_cbranch_execz .LBB8_2
-; GFX942-NEXT:  ; %bb.1: ; %bb.1
-; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[10:11]
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v2
-; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX942-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX942-NEXT:  .LBB8_2: ; %Flow
-; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX942-NEXT:    s_cbranch_execz .LBB8_4
-; GFX942-NEXT:  ; %bb.3: ; %bb.2
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:  ; %bb.1: ; %bb.2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT:  .LBB8_4: ; %bb.3
-; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:  .LBB8_2: ; %bb.3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
@@ -449,36 +444,26 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
 ; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX942-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v4
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v4
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 7, v4
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v5, s[8:9]
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v5, s[10:11]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[0:1]
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
 ; GFX942-NEXT:    s_cbranch_execz .LBB9_2
-; GFX942-NEXT:  ; %bb.1: ; %bb.1
-; GFX942-NEXT:    global_load_dwordx2 v[2:3], v5, s[10:11]
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v4
-; GFX942-NEXT:    s_waitcnt vmcnt(1)
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX942-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX942-NEXT:  .LBB9_2: ; %Flow
-; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX942-NEXT:    s_cbranch_execz .LBB9_4
-; GFX942-NEXT:  ; %bb.3: ; %bb.2
+; GFX942-NEXT:  ; %bb.1: ; %bb.2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
-; GFX942-NEXT:  .LBB9_4: ; %bb.3
-; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:  .LBB9_2: ; %bb.3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[14:15]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -507,88 +492,30 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX942-LABEL: v8i8_phi_const:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 3, v16
-; GFX942-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v16
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v16
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
+; GFX942-NEXT:    v_cmp_lt_u32_e32 vcc, 14, v4
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 7, v4
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[8:9]
-; GFX942-NEXT:    ; implicit-def: $vgpr2
-; GFX942-NEXT:    ; implicit-def: $vgpr12
-; GFX942-NEXT:    ; implicit-def: $vgpr10
-; GFX942-NEXT:    ; implicit-def: $vgpr13
-; GFX942-NEXT:    ; implicit-def: $vgpr14
-; GFX942-NEXT:    ; implicit-def: $vgpr11
-; GFX942-NEXT:    ; implicit-def: $vgpr15
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v5, s[8:9]
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v5, s[10:11]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[0:1]
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
 ; GFX942-NEXT:    s_cbranch_execz .LBB10_2
-; GFX942-NEXT:  ; %bb.1: ; %bb.1
-; GFX942-NEXT:    global_load_dwordx2 v[2:3], v3, s[10:11]
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v16
-; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX942-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT:    v_mov_b32_e32 v4, 8
-; GFX942-NEXT:    v_mov_b32_e32 v5, 7
-; GFX942-NEXT:    v_mov_b32_e32 v6, 6
-; GFX942-NEXT:    v_mov_b32_e32 v1, 5
-; GFX942-NEXT:    v_mov_b32_e32 v7, 4
-; GFX942-NEXT:    v_mov_b32_e32 v8, 3
-; GFX942-NEXT:    v_mov_b32_e32 v9, 2
-; GFX942-NEXT:    v_mov_b32_e32 v0, 1
-; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:  ; %bb.1: ; %bb.2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
-; GFX942-NEXT:  .LBB10_2: ; %Flow
-; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX942-NEXT:    s_cbranch_execz .LBB10_4
-; GFX942-NEXT:  ; %bb.3: ; %bb.2
-; GFX942-NEXT:    v_lshlrev_b16_e32 v2, 8, v9
-; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v7
-; GFX942-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
-; GFX942-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
-; GFX942-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mov_b32_e32 v10, 0
-; GFX942-NEXT:    v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    global_store_dwordx2 v10, v[2:3], s[12:13]
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v9
-; GFX942-NEXT:    v_mov_b32_e32 v10, v8
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
-; GFX942-NEXT:    v_mov_b32_e32 v15, v4
-; GFX942-NEXT:  .LBB10_4: ; %bb.3
-; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:    v_lshlrev_b16_e32 v0, 8, v12
-; GFX942-NEXT:    v_lshlrev_b16_e32 v1, 8, v13
-; GFX942-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v2, 8, v15
-; GFX942-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v1, 8, v14
-; GFX942-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x8070605
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x4030201
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[12:13]
+; GFX942-NEXT:  .LBB10_2: ; %bb.3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -617,30 +544,27 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
 ; GFX942-LABEL: v8i8_multi_block:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT:    v_and_b32_e32 v5, 0x3ff, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v6, 3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v5
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v4
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 15, v4
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v5, s[8:9]
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v5, s[10:11]
+; GFX942-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
-; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX942-NEXT:    s_cbranch_execz .LBB11_4
-; GFX942-NEXT:  ; %bb.1: ; %bb.1
-; GFX942-NEXT:    global_load_dwordx2 v[0:1], v6, s[10:11]
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v5
-; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX942-NEXT:    s_cbranch_execz .LBB11_3
-; GFX942-NEXT:  ; %bb.2: ; %bb.2
-; GFX942-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-NEXT:    global_store_dwordx2 v5, v[2:3], s[12:13]
-; GFX942-NEXT:  .LBB11_3: ; %Flow
-; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:  .LBB11_4: ; %bb.3
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v1, v3, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v0, v2, s[0:1]
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-NEXT:    s_cbranch_execz .LBB11_2
+; GFX942-NEXT:  ; %bb.1: ; %bb.2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT:  .LBB11_2: ; %bb.3
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    global_store_dwordx2 v0, v[4:5], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()

>From e531e5238894e6f8acb53368740c835a07c87be4 Mon Sep 17 00:00:00 2001
From: h2h <h2h at meta.com>
Date: Sat, 14 Jun 2025 14:25:13 -0700
Subject: [PATCH 4/4] Complete checking in the test case

---
 .../Transforms/SimplifyCFG/flatten-cfg-with-phi.ll    | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
index cf7c3f566e8df..7f2d12eba5b73 100644
--- a/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
+++ b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll
@@ -12,7 +12,16 @@ define i1 @_Z7compareRK1SS1_(ptr %a, ptr %b) {
 ; CHECK-NEXT:   %2 = and i1 %cmp.i, %cmp.i19
 ; CHECK-NEXT:   %3 = select i1 %cmp.i, i1 false, i1 true
 ; CHECK-NEXT:   br i1 %2, label %land.rhs, label %lor.end
-; CHECK-LABEL: lor.end:                                          ; preds = %land.rhs, %entry
+; CHECK-EMPTY:
+; CHECK-NEXT: land.rhs:  ; preds = %entry
+; CHECK-NEXT:   %y = getelementptr inbounds nuw i8, ptr %a, i64 4
+; CHECK-NEXT:   %4 = load i32, ptr %y, align 4, !tbaa !8
+; CHECK-NEXT:   %y14 = getelementptr inbounds nuw i8, ptr %b, i64 4
+; CHECK-NEXT:   %5 = load i32, ptr %y14, align 4, !tbaa !8
+; CHECK-NEXT:   %cmp = icmp slt i32 %4, %5
+; CHECK-NEXT:   br label %lor.end
+; CHECK-EMPTY:
+; CHECK-NEXT: lor.end:  ; preds = %land.rhs, %entry
 ; CHECK-NEXT:   %6 = phi i1 [ %cmp, %land.rhs ], [ %3, %entry ]
 ; CHECK-NEXT:   ret i1 %6
 entry:



More information about the llvm-commits mailing list