[llvm] [Utils][UnifyLoopExits] Avoid costly updates if nothing changed (PR #129179)

Johannes Doerfert via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 27 21:18:11 PST 2025


https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/129179

>From 4f77f557e50d53d976bcbc0888fbda06f1e10e19 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 9 Oct 2024 16:41:25 -0700
Subject: [PATCH] [Utils][UnifyLoopExits] Avoid costly updates if nothing
 changed

If the ControlFlowHub did not perform any change to the control flow,
there is no need to repair SSA, update the loop structure, and verify
a bunch of things. This is not completely NFC though, repairSSA
introduced PHI nodes with a single entry that are now missing.

My code went from 400+ seconds to 1 second, since no loop required the
exits to be unified, but there were many "complex" loops.
---
 .../llvm/Transforms/Utils/ControlFlowUtils.h  |  4 ++-
 .../lib/Transforms/Utils/ControlFlowUtils.cpp |  6 ++--
 llvm/lib/Transforms/Utils/UnifyLoopExits.cpp  |  6 +++-
 ...vergence-divergent-i1-used-outside-loop.ll |  6 ++--
 .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll    | 32 +++++++++----------
 .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll     |  2 ++
 6 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h
index f789c3af75c68..810fef29f4010 100644
--- a/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h
@@ -110,7 +110,9 @@ struct ControlFlowHub {
     Branches.emplace_back(BB, Succ0, Succ1);
   }
 
-  BasicBlock *
+  /// Return the unified loop exit block and a flag indicating if the CFG was
+  /// changed at all.
+  std::pair<BasicBlock *, bool>
   finalize(DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
            const StringRef Prefix,
            std::optional<unsigned> MaxControlFlowBooleans = std::nullopt);
diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
index 5ba626fa213ad..0155a7ba2570b 100644
--- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
@@ -270,7 +270,7 @@ static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
   }
 }
 
-BasicBlock *ControlFlowHub::finalize(
+std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
     DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
     const StringRef Prefix, std::optional<unsigned> MaxControlFlowBooleans) {
 #ifndef NDEBUG
@@ -289,7 +289,7 @@ BasicBlock *ControlFlowHub::finalize(
   }
 
   if (Outgoing.size() < 2)
-    return Outgoing.front();
+    return {Outgoing.front(), false};
 
   SmallVector<DominatorTree::UpdateType, 16> Updates;
   if (DTU) {
@@ -338,5 +338,5 @@ BasicBlock *ControlFlowHub::finalize(
         Inst->eraseFromParent();
   }
 
-  return FirstGuardBlock;
+  return {FirstGuardBlock, true};
 }
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 856f3c3ed3e13..9f338dbc78cff 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -169,8 +169,12 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
 
   SmallVector<BasicBlock *, 8> GuardBlocks;
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-  BasicBlock *LoopExitBlock = CHub.finalize(
+  BasicBlock *LoopExitBlock;
+  bool ChangedCFG;
+  std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
       &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue());
+  if (!ChangedCFG)
+    return false;
 
   restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 91e16d91ddd15..a66c21feb1cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    ; implicit-def: $sgpr6
-; GFX10-NEXT:    v_mov_b32_e32 v4, s5
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX10-NEXT:    s_branch .LBB4_2
 ; GFX10-NEXT:  .LBB4_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
@@ -312,6 +312,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX10-NEXT:  .LBB4_2: ; %cond.block.0
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
 ; GFX10-NEXT:    s_and_saveexec_b32 s7, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_4
@@ -328,11 +329,12 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
 ; GFX10-NEXT:    s_mov_b32 s7, -1
+; GFX10-NEXT:    ; implicit-def: $vgpr5
 ; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_1
 ; GFX10-NEXT:  ; %bb.5: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v4
 ; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
 ; GFX10-NEXT:    s_and_b32 s7, exec_lo, 0
 ; GFX10-NEXT:    s_or_b32 s7, s4, s7
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 50b6ad9f0cb37..15f5b2f76dfc5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7646,9 +7646,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX7-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_2
-; GFX7-NEXT:  ; %bb.3: ; %Flow23
+; GFX7-NEXT:  ; %bb.3: ; %Flow22
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:  .LBB28_4: ; %Flow24
+; GFX7-NEXT:  .LBB28_4: ; %Flow23
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v1
@@ -7676,7 +7676,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_6
-; GFX7-NEXT:  .LBB28_7: ; %Flow22
+; GFX7-NEXT:  .LBB28_7: ; %Flow21
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
@@ -7725,7 +7725,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_11
 ; GFX7-NEXT:  ; %bb.12: ; %Flow
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX7-NEXT:  .LBB28_13: ; %Flow20
+; GFX7-NEXT:  .LBB28_13: ; %Flow19
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    v_readfirstlane_b32 s4, v2
@@ -7770,9 +7770,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_2
-; GFX6-NEXT:  ; %bb.3: ; %Flow21
+; GFX6-NEXT:  ; %bb.3: ; %Flow20
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:  .LBB28_4: ; %Flow22
+; GFX6-NEXT:  .LBB28_4: ; %Flow21
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v1
@@ -7800,7 +7800,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_6
-; GFX6-NEXT:  .LBB28_7: ; %Flow20
+; GFX6-NEXT:  .LBB28_7: ; %Flow19
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
@@ -7849,7 +7849,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_11
 ; GFX6-NEXT:  ; %bb.12: ; %Flow
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX6-NEXT:  .LBB28_13: ; %Flow18
+; GFX6-NEXT:  .LBB28_13: ; %Flow17
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v2
@@ -8483,9 +8483,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX7-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_2
-; GFX7-NEXT:  ; %bb.3: ; %Flow23
+; GFX7-NEXT:  ; %bb.3: ; %Flow22
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:  .LBB29_4: ; %Flow24
+; GFX7-NEXT:  .LBB29_4: ; %Flow23
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v1
@@ -8513,7 +8513,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_6
-; GFX7-NEXT:  .LBB29_7: ; %Flow22
+; GFX7-NEXT:  .LBB29_7: ; %Flow21
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
@@ -8562,7 +8562,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_11
 ; GFX7-NEXT:  ; %bb.12: ; %Flow
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX7-NEXT:  .LBB29_13: ; %Flow20
+; GFX7-NEXT:  .LBB29_13: ; %Flow19
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    v_readfirstlane_b32 s4, v2
@@ -8607,9 +8607,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_2
-; GFX6-NEXT:  ; %bb.3: ; %Flow21
+; GFX6-NEXT:  ; %bb.3: ; %Flow20
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:  .LBB29_4: ; %Flow22
+; GFX6-NEXT:  .LBB29_4: ; %Flow21
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v1
@@ -8637,7 +8637,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_6
-; GFX6-NEXT:  .LBB29_7: ; %Flow20
+; GFX6-NEXT:  .LBB29_7: ; %Flow19
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
@@ -8686,7 +8686,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_11
 ; GFX6-NEXT:  ; %bb.12: ; %Flow
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX6-NEXT:  .LBB29_13: ; %Flow18
+; GFX6-NEXT:  .LBB29_13: ; %Flow17
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index a9b8663a48dea..dad59daaefb5f 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -39,6 +39,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX942-NEXT:  ; %bb.3:
 ; GFX942-NEXT:    ; implicit-def: $sgpr3
+; GFX942-NEXT:    ; implicit-def: $agpr0
 ; GFX942-NEXT:  .LBB0_4: ; %common.ret
 ; GFX942-NEXT:    s_endpgm
 ;
@@ -79,6 +80,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX908-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX908-NEXT:  ; %bb.3:
 ; GFX908-NEXT:    ; implicit-def: $sgpr3
+; GFX908-NEXT:    ; implicit-def: $agpr0
 ; GFX908-NEXT:  .LBB0_4: ; %common.ret
 ; GFX908-NEXT:    s_endpgm
 entry:



More information about the llvm-commits mailing list