[llvm] StructurizeCFG: Optimize phi insertion during ssa reconstruction (PR #101301)

Fri Aug 2 03:45:35 PDT 2024

https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/101301

>From 0719bcd30be96e9aa2c53cce4503f6a5c3185b20 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 31 Jul 2024 15:07:17 +0800
Subject: [PATCH 1/4] [AMDGPU,test] Add one more while-break case

The new case suffers from the v_mov issue: extra v_mov copies are emitted.
---
 llvm/test/CodeGen/AMDGPU/while-break.ll | 86 +++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll
index 13b37b40ee95c..46254994580d2 100644
--- a/llvm/test/CodeGen/AMDGPU/while-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/while-break.ll
@@ -152,4 +152,90 @@ end:
   ret float %r
 }
 
+; Two chains of phi network that have the same value from %if block.
+define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 %z, ptr addrspace(1) %p) #0 {
+; GCN-LABEL: while_break_two_chains_of_phi:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_branch .LBB2_2
+; GCN-NEXT:  .LBB2_1: ; %Flow1
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GCN-NEXT:    s_and_b32 s1, exec_lo, s4
+; GCN-NEXT:    s_or_b32 s2, s1, s2
+; GCN-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GCN-NEXT:    s_cbranch_execz .LBB2_6
+; GCN-NEXT:  .LBB2_2: ; %header
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_cmp_ge_i32_e64 s3, s0, v1
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v1
+; GCN-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GCN-NEXT:    s_cbranch_execz .LBB2_4
+; GCN-NEXT:  ; %bb.3: ; %if
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    s_ashr_i32 s1, s0, 31
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[0:1], 2
+; GCN-NEXT:    s_andn2_b32 s1, s3, exec_lo
+; GCN-NEXT:    v_add_co_u32 v6, vcc_lo, v4, s6
+; GCN-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s7, v5, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v2
+; GCN-NEXT:    global_load_dword v0, v[6:7], off
+; GCN-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
+; GCN-NEXT:    s_or_b32 s3, s1, s3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, v6
+; GCN-NEXT:  .LBB2_4: ; %Flow
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GCN-NEXT:    v_mov_b32_e32 v7, v6
+; GCN-NEXT:    s_mov_b32 s4, -1
+; GCN-NEXT:    s_and_saveexec_b32 s1, s3
+; GCN-NEXT:    s_cbranch_execz .LBB2_1
+; GCN-NEXT:  ; %bb.5: ; %latch
+; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v3
+; GCN-NEXT:    v_mov_b32_e32 v7, v0
+; GCN-NEXT:    s_add_i32 s0, s0, 1
+; GCN-NEXT:    s_orn2_b32 s4, vcc_lo, exec_lo
+; GCN-NEXT:    s_branch .LBB2_1
+; GCN-NEXT:  .LBB2_6: ; %end
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, v7
+; GCN-NEXT:    v_mov_b32_e32 v1, v6
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  br label %header
+
+header:
+  %v.1 = phi float [ %v, %entry ], [ %v.2, %latch ]
+  %v.copy = phi float [ 0.0, %entry ], [ %v.copy.2, %latch ]
+  %ind = phi i32 [ 0, %entry], [ %ind.inc, %latch ]
+  %cc = icmp slt i32 %ind, %x
+  br i1 %cc, label %if, label %latch
+
+if:
+  %v.ptr = getelementptr float, ptr addrspace(1) %p, i32 %ind
+  %v.load = load float, ptr addrspace(1) %v.ptr
+  %v.if = fadd float %v.load, 1.0
+  %cc2 = icmp slt i32 %ind, %y
+  br i1 %cc2, label %latch, label %end
+
+latch:
+  %v.2 = phi float [ %v.1, %header ], [ %v.if, %if ]
+  %v.copy.2 = phi float [ %v.copy, %header ], [ %v.if, %if ]
+  %ind.inc = add i32 %ind, 1
+  %cc3 = icmp slt i32 %ind, %z
+  br i1 %cc3, label %end, label %header
+
+end:
+  %r = phi float [ %v.2, %latch ], [ %v.if, %if ]
+  %r2 = phi float [ %v.copy.2, %latch ], [ %v.if, %if ]
+  %packed0 = insertelement < 2 x float > poison, float %r, i32 0
+  %packed1 = insertelement < 2 x float > %packed0, float %r2, i32 1
+  ret < 2 x float> %packed1
+}
+
 attributes #0 = { nounwind }

>From 5ae35eb6755f729d16bac5cd646807e862ac4f6a Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 30 Jul 2024 14:04:13 +0800
Subject: [PATCH 2/4] StructurizeCFG: Optimize phi insertion during ssa
 reconstruction

After investigating more while-break cases, I think we should try to optimize
the way we reconstruct phi nodes. Previously, we reconstructed each phi
node separately, but this is not optimal. For example:

```
header:
  %v.1 = phi float [ %v, %entry ], [ %v.2, %latch ]
  br i1 %cc, label %if, label %latch

if:
  %v.if = fadd float %v.1, 1.0
  br i1 %cc2, label %latch, label %exit

latch:
  %v.2 = phi float [ %v.if, %if ], [ %v.1, %header ]
  br i1 %cc3, label %exit, label %header

exit:
  %v.3 = phi float [ %v.2, %latch ], [ %v.if, %if ]
```

For this case, we have different copies of value `v`, but there is at
most one copy of value `v` alive at any program point shown above.

The existing ssa reconstruction will use the incoming values from the
old deleted phi. Below is a possible output after ssa reconstruction.

```
header:
  %v.1 = phi float [ %v, %entry ], [ %v.loop, %flow1 ]
  br i1 %cc, label %if, label %flow

if:
  %v.if = fadd float %v.1, 1.0
  br label %flow

flow:
  %v.exit.if = phi float [ %v.if, %if ], [ undef, %header ]
  %v.latch = phi float [ %v.if, %if ], [ %v.1, %header ]

latch:
  br label %flow1

flow1:
  %v.loop = phi float [ %v.latch, %latch ], [ undef, %flow ]
  %v.exit = phi float [ %v.latch, %latch ], [ %v.exit.if, %flow ]

exit:
  %v.3 = phi float [ %v.exit, %flow1 ]
```

If we look closely, in order to reconstruct `v.1`, `v.2` and `v.3`, we end up
with two simultaneous copies of `v` alive at `flow` and `flow1`.
We depend heavily on the register coalescer to coalesce them together.
But the register coalescer may not always be able to coalesce them
because of the complexity of the phi chain.

On the other hand, since we have only one copy of `v` alive at any
program point before the transform, why not simplify the phi network
as much as we can? Look at the incoming values of these PHIs:
      header    if     latch
v.1:   --       --      v.2
v.2:   v.1      v.if    --
v.3:   --       v.if    v.2

If we let them share the same incoming values for these three different
incoming blocks, then we would have only one live copy of `v` at any
program point after SSA reconstruction. Something like:

```
header:
  %v.1 = phi float [ %v, %entry ], [ %v.2, %flow1 ]
  br i1 %cc, label %if, label %flow

if:
  %v.if = fadd float %v.1, 1.0
  br label %flow

flow:
  %v.2 = phi float [ %v.if, %if ], [ %v.1, %header ]

latch:
  br label %flow1

flow1:
  ...

exit:
  %v.3 = phi float [ %v.2, %flow1 ]
```
---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 126 +++++++++++++++++-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |  26 ++--
 llvm/test/CodeGen/AMDGPU/while-break.ll       |  13 +-
 .../AMDGPU/loop-subregion-misordered.ll       |  23 ++--
 4 files changed, 144 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 9c711ec183821..5d4969dea66bc 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -288,6 +288,10 @@ class StructurizeCFG {
   void findUndefBlocks(BasicBlock *PHIBlock,
                        const SmallSet<BasicBlock *, 8> &Incomings,
                        SmallVector<BasicBlock *> &UndefBlks) const;
+
+  bool isCompatible(const BBValueVector &IncomingA,
+                    const BBValueVector &IncomingB, BBValueVector &Merged);
+
   void setPhiValues();
 
   void simplifyAffectedPhis();
@@ -710,10 +714,107 @@ void StructurizeCFG::findUndefBlocks(
   }
 }
 
+// Return true if two PHI nodes have compatible incoming values (for each
+// incoming block, either they have the same incoming value or only one PHI
+// node has a incoming value). And return the union of the incoming values
+// through \p Merged.
+bool StructurizeCFG::isCompatible(const BBValueVector &IncomingA,
+                                  const BBValueVector &IncomingB,
+                                  BBValueVector &Merged) {
+  MapVector<BasicBlock *, Value *> UnionSet;
+  for (auto &V : IncomingA)
+    UnionSet.insert(V);
+
+  for (auto &V : IncomingB) {
+    if (UnionSet.contains(V.first) && UnionSet[V.first] != V.second)
+      return false;
+    // Either IncomingA does not have this value or IncomingA has the same
+    // value.
+    UnionSet.insert(V);
+  }
+
+  Merged.clear();
+  Merged.append(UnionSet.takeVector());
+  return true;
+}
+
 /// Add the real PHI value as soon as everything is set up
 void StructurizeCFG::setPhiValues() {
   SmallVector<PHINode *, 8> InsertedPhis;
   SSAUpdater Updater(&InsertedPhis);
+
+  DenseMap<PHINode *, std::shared_ptr<BBValueVector>> MergedPHIMap;
+  // Find out phi nodes that have compatible incoming values (either they have
+  // the same value for the same block or one have undefined value, see example
+  // below). We only search again the phi's that are referenced by another phi,
+  // which is the cases we care.
+  //
+  // For example (-- means no incoming value):
+  // phi1 : BB1:phi2   BB2:v  BB3:--
+  // phi2:  BB1:--     BB2:v  BB3:w
+  //
+  // Then we can merge these incoming values and let phi1, phi2 use the
+  // same set of incoming values:
+  //
+  // phi1&phi2: BB1:phi2  BB2:v  BB3:w
+  //
+  // By doing this, phi1 and phi2 would share more intermediate phi nodes.
+  // This would help reducing number of phi nodes during SSA reconstruction and
+  // get less COPY instructions finally.
+  //
+  // This should be correct, because if a phi node does not have incoming
+  // value from certain block, this means the block is not the predecessor
+  // of the parent block, so we actually don't care its incoming value.
+  for (const auto &AddedPhi : AddedPhis) {
+    BasicBlock *To = AddedPhi.first;
+    if (!DeletedPhis.contains(To))
+      continue;
+    PhiMap &OldPhi = DeletedPhis[To];
+    for (const auto &PI : OldPhi) {
+      SmallVector<PHINode *> IncomingPHIs;
+      PHINode *Phi = PI.first;
+      for (const auto &VI : PI.second) {
+        // First, for each phi, check whether it has incoming value which is
+        // another phi.
+        if (PHINode *P = dyn_cast<PHINode>(VI.second))
+          IncomingPHIs.push_back(P);
+      }
+
+      auto GetUpdatedIncoming = [&](PHINode *Phi) {
+        return MergedPHIMap.contains(Phi) ? *MergedPHIMap[Phi]
+                                          : DeletedPhis[Phi->getParent()][Phi];
+      };
+      for (auto *OtherPhi : IncomingPHIs) {
+        // Skip phis that are not unrelated to the phi reconstruction for now.
+        if (!DeletedPhis.contains(OtherPhi->getParent()))
+          continue;
+
+        // Skip phis that were already merged with others.
+        if (MergedPHIMap.contains(Phi) && MergedPHIMap.contains(OtherPhi))
+          continue;
+
+        std::shared_ptr<BBValueVector> MergedIncomings;
+        if (MergedPHIMap.contains(Phi))
+          MergedIncomings = MergedPHIMap[Phi];
+        else if (MergedPHIMap.contains(OtherPhi))
+          MergedIncomings = MergedPHIMap[OtherPhi];
+        else
+          MergedIncomings = std::make_shared<BBValueVector>();
+
+        const auto &Incoming = GetUpdatedIncoming(Phi);
+        const auto &OtherIncoming = GetUpdatedIncoming(OtherPhi);
+        if (isCompatible(Incoming, OtherIncoming, *MergedIncomings)) {
+          // union the incoming values
+          if (!MergedPHIMap.contains(Phi))
+            MergedPHIMap.insert(std::pair(Phi, MergedIncomings));
+
+          if (!MergedPHIMap.contains(OtherPhi))
+            MergedPHIMap.insert(std::pair(OtherPhi, MergedIncomings));
+        }
+      }
+    }
+  }
+
   for (const auto &AddedPhi : AddedPhis) {
     BasicBlock *To = AddedPhi.first;
     const BBVector &From = AddedPhi.second;
@@ -731,20 +832,27 @@ void StructurizeCFG::setPhiValues() {
       Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
       Updater.AddAvailableValue(To, Undef);
 
-      SmallSet<BasicBlock *, 8> Incomings;
       SmallVector<BasicBlock *> ConstantPreds;
-      for (const auto &VI : PI.second) {
-        Incomings.insert(VI.first);
-        Updater.AddAvailableValue(VI.first, VI.second);
-        if (isa<Constant>(VI.second))
-          ConstantPreds.push_back(VI.first);
-      }
 
       if (!CachedUndefs) {
+        SmallSet<BasicBlock *, 8> Incomings;
+        for (const auto &VI : PI.second)
+          Incomings.insert(VI.first);
+
+        // Get the undefined blocks shared for all the phi nodes.
         findUndefBlocks(To, Incomings, UndefBlks);
         CachedUndefs = true;
       }
 
+      // Use updated incoming vector.
+      const auto &IncomingMap =
+          MergedPHIMap.contains(Phi) ? *MergedPHIMap[Phi] : PI.second;
+      for (const auto &VI : IncomingMap) {
+        Updater.AddAvailableValue(VI.first, VI.second);
+        if (isa<Constant>(VI.second))
+          ConstantPreds.push_back(VI.first);
+      }
+
       for (auto UB : UndefBlks) {
         // If this undef block is dominated by any predecessor(before
         // structurization) of reconstructed PHI with constant incoming value,
@@ -753,6 +861,10 @@ void StructurizeCFG::setPhiValues() {
         if (any_of(ConstantPreds,
                    [&](BasicBlock *CP) { return DT->dominates(CP, UB); }))
           continue;
+        // Maybe already get a value through sharing with other phi nodes.
+        if (Updater.HasValueForBlock(UB))
+          continue;
+
         Updater.AddAvailableValue(UB, Undef);
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index fb96b9ff2952e..e0226c35cc2de 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -577,11 +577,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v6
-; GFX908-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s8
+; GFX908-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s9
-; GFX908-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v7, s9
+; GFX908-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX908-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX908-NEXT:    s_mov_b64 s[18:19], s[10:11]
@@ -642,10 +642,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_add_f32_e32 v12, v20, v12
 ; GFX908-NEXT:    v_add_f32_e32 v5, v5, v25
 ; GFX908-NEXT:    v_add_f32_e32 v4, v4, v24
-; GFX908-NEXT:    v_add_f32_e32 v9, v9, v27
-; GFX908-NEXT:    v_add_f32_e32 v8, v8, v26
-; GFX908-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX908-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX908-NEXT:    v_add_f32_e32 v7, v7, v27
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v26
+; GFX908-NEXT:    v_add_f32_e32 v8, v8, v14
+; GFX908-NEXT:    v_add_f32_e32 v9, v9, v15
 ; GFX908-NEXT:    v_add_f32_e32 v10, v10, v12
 ; GFX908-NEXT:    v_add_f32_e32 v11, v11, v13
 ; GFX908-NEXT:    s_mov_b64 s[20:21], -1
@@ -655,10 +655,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX908-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; GFX908-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX908-NEXT:    ; implicit-def: $sgpr18_sgpr19
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
@@ -744,8 +740,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v8
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
 ; GFX90A-NEXT:    s_mov_b64 s[18:19], s[10:11]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -801,8 +797,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[22:23], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[14:15], v[20:21], v[14:15]
 ; GFX90A-NEXT:    v_pk_add_f32 v[6:7], v[6:7], v[24:25]
-; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[26:27]
-; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[16:17]
+; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_mov_b64 s[20:21], -1
 ; GFX90A-NEXT:    s_branch .LBB3_4
@@ -811,10 +807,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX90A-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    ; implicit-def: $vgpr12_vgpr13
-; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX90A-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX90A-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $sgpr18_sgpr19
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll
index 46254994580d2..9bb8a2f9f0282 100644
--- a/llvm/test/CodeGen/AMDGPU/while-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/while-break.ll
@@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
 ; GCN-NEXT:    s_branch .LBB2_2
 ; GCN-NEXT:  .LBB2_1: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GCN-NEXT:    s_and_b32 s1, exec_lo, s4
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GCN-NEXT:    s_and_b32 s1, exec_lo, s1
 ; GCN-NEXT:    s_or_b32 s2, s1, s2
 ; GCN-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
 ; GCN-NEXT:    s_cbranch_execz .LBB2_6
@@ -190,20 +190,17 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
 ; GCN-NEXT:  .LBB2_4: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GCN-NEXT:    v_mov_b32_e32 v7, v6
-; GCN-NEXT:    s_mov_b32 s4, -1
-; GCN-NEXT:    s_and_saveexec_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s1, -1
+; GCN-NEXT:    s_and_saveexec_b32 s4, s3
 ; GCN-NEXT:    s_cbranch_execz .LBB2_1
 ; GCN-NEXT:  ; %bb.5: ; %latch
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, s0, v3
-; GCN-NEXT:    v_mov_b32_e32 v7, v0
 ; GCN-NEXT:    s_add_i32 s0, s0, 1
-; GCN-NEXT:    s_orn2_b32 s4, vcc_lo, exec_lo
+; GCN-NEXT:    s_orn2_b32 s1, vcc_lo, exec_lo
 ; GCN-NEXT:    s_branch .LBB2_1
 ; GCN-NEXT:  .LBB2_6: ; %end
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, v7
 ; GCN-NEXT:    v_mov_b32_e32 v1, v6
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
index 10a3e65e5f57d..385e37e2750d1 100644
--- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
+++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
@@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       LOOP.HEADER:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ]
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 0x100b
 ; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]]
@@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
 ; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
 ; CHECK:       Flow2:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]]
 ; CHECK:       INNER_LOOP:
 ; CHECK-NEXT:    [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
@@ -66,20 +66,19 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
 ; CHECK-NEXT:    [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271
 ; CHECK-NEXT:    br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]]
 ; CHECK:       Flow3:
-; CHECK-NEXT:    [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
 ; CHECK:       Flow4:
-; CHECK-NEXT:    br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
+; CHECK-NEXT:    br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
 ; CHECK:       bb64:
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       Flow:
-; CHECK-NEXT:    [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[BB18]], label [[FLOW2]]
+; CHECK-NEXT:    [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[BB18]], label [[FLOW2]]
 ; CHECK:       INCREMENT_I:
 ; CHECK-NEXT:    [[INC_I]] = add i32 [[I]], 1
 ; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1336

>From 57071ca8f5328ddc81413d4b1ba022f44ed191a6 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Thu, 1 Aug 2024 16:29:46 +0800
Subject: [PATCH 3/4] fixup! StructurizeCFG: Optimize phi insertion during ssa
 reconstruction

---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 74 ++++++++++---------
 1 file changed, 41 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5d4969dea66bc..d36abbc877197 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -721,20 +721,21 @@ void StructurizeCFG::findUndefBlocks(
 bool StructurizeCFG::isCompatible(const BBValueVector &IncomingA,
                                   const BBValueVector &IncomingB,
                                   BBValueVector &Merged) {
-  MapVector<BasicBlock *, Value *> UnionSet;
+  DenseMap<BasicBlock *, Value *> UnionSet;
   for (auto &V : IncomingA)
     UnionSet.insert(V);
 
-  for (auto &V : IncomingB) {
-    if (UnionSet.contains(V.first) && UnionSet[V.first] != V.second)
+  for (auto &[BB, V] : IncomingB) {
+    if (UnionSet.contains(BB) && UnionSet[BB] != V)
       return false;
     // Either IncomingA does not have this value or IncomingA has the same
     // value.
-    UnionSet.insert(V);
+    UnionSet.insert({BB, V});
   }
 
   Merged.clear();
-  Merged.append(UnionSet.takeVector());
+  for (auto &[BB, V] : UnionSet)
+    Merged.push_back({BB, V});
   return true;
 }
 
@@ -743,7 +744,9 @@ void StructurizeCFG::setPhiValues() {
   SmallVector<PHINode *, 8> InsertedPhis;
   SSAUpdater Updater(&InsertedPhis);
 
-  DenseMap<PHINode *, std::shared_ptr<BBValueVector>> MergedPHIMap;
+  SmallVector<BBValueVector> BBValuesPool;
+  // Map PHINode to the index of the merged incoming values in BBValuesPool
+  DenseMap<PHINode *, unsigned> MergedPHIMap;
   // Find out phi nodes that have compatible incoming values (either they have
   // the same value for the same block or one have undefined value, see example
   // below). We only search again the phi's that are referenced by another phi,
@@ -765,51 +768,55 @@ void StructurizeCFG::setPhiValues() {
   // This should be correct, because if a phi node does not have incoming
   // value from certain block, this means the block is not the predecessor
   // of the parent block, so we actually don't care its incoming value.
-  for (const auto &AddedPhi : AddedPhis) {
-    BasicBlock *To = AddedPhi.first;
+  for (const auto &[To, From] : AddedPhis) {
     if (!DeletedPhis.contains(To))
       continue;
     PhiMap &OldPhi = DeletedPhis[To];
-    for (const auto &PI : OldPhi) {
+    for (const auto &[Phi, Incomings] : OldPhi) {
       SmallVector<PHINode *> IncomingPHIs;
-      PHINode *Phi = PI.first;
-      for (const auto &VI : PI.second) {
+      for (const auto &[BB, V] : Incomings) {
         // First, for each phi, check whether it has incoming value which is
         // another phi.
-        if (PHINode *P = dyn_cast<PHINode>(VI.second))
+        if (PHINode *P = dyn_cast<PHINode>(V))
           IncomingPHIs.push_back(P);
       }
 
-      auto GetUpdatedIncoming = [&](PHINode *Phi) {
-        return MergedPHIMap.contains(Phi) ? *MergedPHIMap[Phi]
-                                          : DeletedPhis[Phi->getParent()][Phi];
+      const auto GetUpdatedIncoming = [&](PHINode *Phi) {
+        if (auto It = MergedPHIMap.find(Phi); It != MergedPHIMap.end())
+          return BBValuesPool[It->second];
+        return DeletedPhis[Phi->getParent()][Phi];
       };
+
       for (auto *OtherPhi : IncomingPHIs) {
-        // Skip phis that are not unrelated to the phi reconstruction for now.
+        // Skip phis that are unrelated to the phi reconstruction for now.
         if (!DeletedPhis.contains(OtherPhi->getParent()))
           continue;
 
-        // Skip phis that were already merged with others.
-        if (MergedPHIMap.contains(Phi) && MergedPHIMap.contains(OtherPhi))
+        auto PhiIt = MergedPHIMap.find(Phi);
+        auto OtherPhiIt = MergedPHIMap.find(OtherPhi);
+        // Skip phis that were both already merged with others.
+        if (PhiIt != MergedPHIMap.end() && OtherPhiIt != MergedPHIMap.end())
           continue;
 
-        std::shared_ptr<BBValueVector> MergedIncomings;
-        if (MergedPHIMap.contains(Phi))
-          MergedIncomings = MergedPHIMap[Phi];
-        else if (MergedPHIMap.contains(OtherPhi))
-          MergedIncomings = MergedPHIMap[OtherPhi];
-        else
-          MergedIncomings = std::make_shared<BBValueVector>();
+        unsigned PoolIndex;
+        if (PhiIt != MergedPHIMap.end()) {
+          PoolIndex = PhiIt->second;
+        } else if (OtherPhiIt != MergedPHIMap.end()) {
+          PoolIndex = OtherPhiIt->second;
+        } else {
+          PoolIndex = BBValuesPool.size();
+          BBValuesPool.push_back(BBValueVector());
+        }
 
         const auto &Incoming = GetUpdatedIncoming(Phi);
         const auto &OtherIncoming = GetUpdatedIncoming(OtherPhi);
-        if (isCompatible(Incoming, OtherIncoming, *MergedIncomings)) {
+        if (isCompatible(Incoming, OtherIncoming, BBValuesPool[PoolIndex])) {
           // union the incoming values
           if (!MergedPHIMap.contains(Phi))
-            MergedPHIMap.insert(std::pair(Phi, MergedIncomings));
+            MergedPHIMap.insert({Phi, PoolIndex});
 
           if (!MergedPHIMap.contains(OtherPhi))
-            MergedPHIMap.insert(std::pair(OtherPhi, MergedIncomings));
+            MergedPHIMap.insert({OtherPhi, PoolIndex});
         }
       }
     }
@@ -845,12 +852,13 @@ void StructurizeCFG::setPhiValues() {
       }
 
       // Use updated incoming vector.
+      auto PhiIt = MergedPHIMap.find(Phi);
       const auto &IncomingMap =
-          MergedPHIMap.contains(Phi) ? *MergedPHIMap[Phi] : PI.second;
-      for (const auto &VI : IncomingMap) {
-        Updater.AddAvailableValue(VI.first, VI.second);
-        if (isa<Constant>(VI.second))
-          ConstantPreds.push_back(VI.first);
+          PhiIt != MergedPHIMap.end() ? BBValuesPool[PhiIt->second] : PI.second;
+      for (const auto &[BB, V] : IncomingMap) {
+        Updater.AddAvailableValue(BB, V);
+        if (isa<Constant>(V))
+          ConstantPreds.push_back(BB);
       }
 
       for (auto UB : UndefBlks) {

>From e5b12bc4373e94df0241bbdfb06e9703d620542e Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Fri, 2 Aug 2024 18:43:07 +0800
Subject: [PATCH 4/4] fixup! StructurizeCFG: Optimize phi insertion during ssa
 reconstruction

---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 109 +++++++++---------
 1 file changed, 52 insertions(+), 57 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index d36abbc877197..db0f86c27b33e 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -256,6 +256,10 @@ class StructurizeCFG {
   BBPhiMap DeletedPhis;
   BB2BBVecMap AddedPhis;
 
+  SmallVector<BBValueVector> BBValuesPool;
+  // Map PHINode to the index of the merged incoming values in BBValuesPool
+  DenseMap<PHINode *, unsigned> MergedPHIMap;
+
   PredMap Predicates;
   BranchVector Conditions;
 
@@ -289,8 +293,7 @@ class StructurizeCFG {
                        const SmallSet<BasicBlock *, 8> &Incomings,
                        SmallVector<BasicBlock *> &UndefBlks) const;
 
-  bool isCompatible(const BBValueVector &IncomingA,
-                    const BBValueVector &IncomingB, BBValueVector &Merged);
+  void mergeIfCompatible(PHINode *A, PHINode *B);
 
   void setPhiValues();
 
@@ -714,29 +717,53 @@ void StructurizeCFG::findUndefBlocks(
   }
 }
 
-// Return true if two PHI nodes have compatible incoming values (for each
+// If two PHI nodes have compatible incoming values (for each
 // incoming block, either they have the same incoming value or only one PHI
-// node has a incoming value). And return the union of the incoming values
-// through \p Merged.
-bool StructurizeCFG::isCompatible(const BBValueVector &IncomingA,
-                                  const BBValueVector &IncomingB,
-                                  BBValueVector &Merged) {
-  DenseMap<BasicBlock *, Value *> UnionSet;
-  for (auto &V : IncomingA)
-    UnionSet.insert(V);
+// node has an incoming value), let them share the merged incoming values.
+void StructurizeCFG::mergeIfCompatible(PHINode *A, PHINode *B) {
+
+  auto ItA = MergedPHIMap.find(A);
+  auto ItB = MergedPHIMap.find(B);
+  bool FoundA = ItA != MergedPHIMap.end();
+  bool FoundB = ItB != MergedPHIMap.end();
+
+  // Skip phis that were both already merged with others.
+  if (FoundA && FoundB)
+    return;
+
+  const auto &IncomingA =
+      FoundA ? BBValuesPool[ItA->second] : DeletedPhis[A->getParent()][A];
+  const auto &IncomingB =
+      FoundB ? BBValuesPool[ItB->second] : DeletedPhis[B->getParent()][B];
 
+  DenseMap<BasicBlock *, Value *> Mergeable(IncomingA.begin(), IncomingA.end());
   for (auto &[BB, V] : IncomingB) {
-    if (UnionSet.contains(BB) && UnionSet[BB] != V)
-      return false;
+    if (Mergeable.contains(BB) && Mergeable[BB] != V)
+      return;
     // Either IncomingA does not have this value or IncomingA has the same
     // value.
-    UnionSet.insert({BB, V});
+    Mergeable.insert({BB, V});
   }
 
-  Merged.clear();
-  for (auto &[BB, V] : UnionSet)
-    Merged.push_back({BB, V});
-  return true;
+  unsigned PoolIndex;
+
+  if (FoundA || FoundB) {
+    PoolIndex = FoundA ? ItA->second : ItB->second;
+    BBValuesPool[PoolIndex].clear();
+    BBValuesPool[PoolIndex].append(Mergeable.begin(), Mergeable.end());
+  } else {
+    PoolIndex = BBValuesPool.size();
+    BBValuesPool.emplace_back(Mergeable.begin(), Mergeable.end());
+  }
+
+  // Skip insertion if the phi was already merged with another phi node.
+  if (!FoundA)
+    MergedPHIMap.insert({A, PoolIndex});
+
+  if (!FoundB)
+    MergedPHIMap.insert({B, PoolIndex});
+
+  return;
 }
 
 /// Add the real PHI value as soon as everything is set up
@@ -744,9 +771,6 @@ void StructurizeCFG::setPhiValues() {
   SmallVector<PHINode *, 8> InsertedPhis;
   SSAUpdater Updater(&InsertedPhis);
 
-  SmallVector<BBValueVector> BBValuesPool;
-  // Map PHINode to the index of the merged incoming values in BBValuesPool
-  DenseMap<PHINode *, unsigned> MergedPHIMap;
   // Find out phi nodes that have compatible incoming values (either they have
   // the same value for the same block or one have undefined value, see example
   // below). We only search again the phi's that are referenced by another phi,
@@ -769,10 +793,11 @@ void StructurizeCFG::setPhiValues() {
   // value from certain block, this means the block is not the predecessor
   // of the parent block, so we actually don't care its incoming value.
   for (const auto &[To, From] : AddedPhis) {
-    if (!DeletedPhis.contains(To))
+    auto OldPhiIt = DeletedPhis.find(To);
+    if (OldPhiIt == DeletedPhis.end())
       continue;
-    PhiMap &OldPhi = DeletedPhis[To];
-    for (const auto &[Phi, Incomings] : OldPhi) {
+
+    for (const auto &[Phi, Incomings] : OldPhiIt->second) {
       SmallVector<PHINode *> IncomingPHIs;
       for (const auto &[BB, V] : Incomings) {
         // First, for each phi, check whether it has incoming value which is
@@ -781,43 +806,11 @@ void StructurizeCFG::setPhiValues() {
           IncomingPHIs.push_back(P);
       }
 
-      const auto GetUpdatedIncoming = [&](PHINode *Phi) {
-        if (auto It = MergedPHIMap.find(Phi); It != MergedPHIMap.end())
-          return BBValuesPool[It->second];
-        return DeletedPhis[Phi->getParent()][Phi];
-      };
-
       for (auto *OtherPhi : IncomingPHIs) {
         // Skip phis that are unrelated to the phi reconstruction for now.
         if (!DeletedPhis.contains(OtherPhi->getParent()))
           continue;
-
-        auto PhiIt = MergedPHIMap.find(Phi);
-        auto OtherPhiIt = MergedPHIMap.find(OtherPhi);
-        // Skip phis that were both already merged with others.
-        if (PhiIt != MergedPHIMap.end() && OtherPhiIt != MergedPHIMap.end())
-          continue;
-
-        unsigned PoolIndex;
-        if (PhiIt != MergedPHIMap.end()) {
-          PoolIndex = PhiIt->second;
-        } else if (OtherPhiIt != MergedPHIMap.end()) {
-          PoolIndex = OtherPhiIt->second;
-        } else {
-          PoolIndex = BBValuesPool.size();
-          BBValuesPool.push_back(BBValueVector());
-        }
-
-        const auto &Incoming = GetUpdatedIncoming(Phi);
-        const auto &OtherIncoming = GetUpdatedIncoming(OtherPhi);
-        if (isCompatible(Incoming, OtherIncoming, BBValuesPool[PoolIndex])) {
-          // union the incoming values
-          if (!MergedPHIMap.contains(Phi))
-            MergedPHIMap.insert({Phi, PoolIndex});
-
-          if (!MergedPHIMap.contains(OtherPhi))
-            MergedPHIMap.insert({OtherPhi, PoolIndex});
-        }
+        mergeIfCompatible(Phi, OtherPhi);
       }
     }
   }
@@ -1318,6 +1311,8 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
   LoopConds.clear();
   FlowSet.clear();
   TermDL.clear();
+  BBValuesPool.clear();
+  MergedPHIMap.clear();
 
   return true;
 }