[llvm] [StructurizeCFG] Order IF Else block using Heuristics (PR #139605)

Vigneshwar Jayakumar via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 10 14:59:16 PDT 2025


https://github.com/VigneshwarJ updated https://github.com/llvm/llvm-project/pull/139605

>From d7da7dd35211a3a4d94ed657ac64cfd682aefe15 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Mon, 12 May 2025 13:15:28 -0500
Subject: [PATCH 1/5] [StructurizeCFG] Order IF Else block using Heuristics

The order of the Then and Else blocks within an SCC is arbitrary, but
depending on that order, structurization can leave behind extra VGPR
copies due to interference during register coalescing.

This patch introduces heuristics that order the Then and Else blocks
based on the potential VGPR copies, in order to maximize coalescing.
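
For illustration, here is a minimal sketch of the shape this targets
(it mirrors test_extractelement_1 in the new order-if-else.ll test
below; the function name is only illustrative). The merge phi takes a
plain extractelement from the Else block, a likely VGPR copy, while
the Then value is computed inside its block, so ordering Else first
lets that copy coalesce:

define void @example(<4 x i32> %vec, i1 %cond, ptr %ptr) {
entry:
  br i1 %cond, label %then, label %else

then:                ; incoming value is computed inside the block
  %x = extractelement <4 x i32> %vec, i32 0
  %z = add i32 %x, 1
  br label %merge

else:                ; incoming value is only a potential copy
  %a = extractelement <4 x i32> %vec, i32 1
  br label %merge

merge:               ; Else scores higher, so it is ordered first
  %phi = phi i32 [ %z, %then ], [ %a, %else ]
  store i32 %phi, ptr %ptr
  ret void
}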
---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp |  80 +++++++++++
 .../StructurizeCFG/order-if-else.ll           | 129 ++++++++++++++++++
 2 files changed, 209 insertions(+)
 create mode 100644 llvm/test/Transforms/StructurizeCFG/order-if-else.ll

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index eb22b50532695..ec54f53d6165b 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -307,6 +307,8 @@ class StructurizeCFG {
 
   RegionNode *PrevNode;
 
+  void reorderIfElseBlock(BasicBlock *BB, unsigned Idx);
+
   void orderNodes();
 
   void analyzeLoops(RegionNode *N);
@@ -409,6 +411,31 @@ class StructurizeCFGLegacyPass : public RegionPass {
 
 } // end anonymous namespace
 
+/// Helper function for heuristics to order if else block
+/// Checks whether an instruction is potential vector copy instruction, if so,
+/// checks if the operands are from different BB. if so, returns True.
+// Then there's a possibility of coelescing without interference when ordered
+// first.
+static bool hasAffectingInstructions(Instruction *I, BasicBlock *BB) {
+
+  if (!I || I->getParent() != BB)
+    return true;
+
+  // If the instruction is not a poterntial copy instructoin, return true.
+  if (!isa<ExtractElementInst>(*I) && !isa<ExtractValueInst>(*I))
+    return false;
+
+  // Check if any operands are instructions defined in the same block.
+  for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
+    if (auto *OpI = dyn_cast<Instruction>(I->getOperand(i))) {
+      if (OpI->getParent() == BB)
+        return false;
+    }
+  }
+
+  return true;
+}
+
 char StructurizeCFGLegacyPass::ID = 0;
 
 INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
@@ -419,6 +446,58 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
 INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
                     "Structurize the CFG", false, false)
 
+/// Then and Else block order in SCC is arbitrary. But based on the
+/// order, after structurization there are cases where there might be extra 
+/// VGPR copies due to interference during register coelescing.
+///  E.g., the incoming phi values from the Else block are only VGPR copies,
+///  while the incoming values from the Then block modify the VGPRs.
+/// after structurization, there would be interference when coelesing when Then
+/// block is ordered first. But those copies can be coelesced when Else is
+/// ordered first.
+///
+/// This function checks the incoming phi values in the merge block and
+/// orders based on the following heuristics  of Then and Else block. Checks
+/// whether an incoming phi can be potential copy instructions and if so 
+/// checks whether copy within the block or not. 
+/// Increases score if its a potential copy from outside the block. 
+/// the higher scored block is ordered first.
+void StructurizeCFG::reorderIfElseBlock(BasicBlock *BB, unsigned Idx) {
+  BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
+
+  if (Term && Term->isConditional()) {
+    BasicBlock *ThenBB = Term->getSuccessor(0);
+    BasicBlock *ElseBB = Term->getSuccessor(1);
+    BasicBlock *ThenSucc = ThenBB->getSingleSuccessor();
+
+    if (BB == ThenBB->getSinglePredecessor() &&
+        (ThenBB->getSinglePredecessor() == ElseBB->getSinglePredecessor()) &&
+        (ThenSucc && ThenSucc == ElseBB->getSingleSuccessor())) {
+      unsigned ThenScore = 0, ElseScore = 0;
+
+      for (PHINode &Phi : ThenSucc->phis()) {
+        Value *ThenVal = Phi.getIncomingValueForBlock(ThenBB);
+        Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
+
+        if (auto *Inst = dyn_cast<Instruction>(ThenVal))
+          ThenScore += hasAffectingInstructions(Inst, ThenBB);
+        if (auto *Inst = dyn_cast<Instruction>(ElseVal))
+          ElseScore += hasAffectingInstructions(Inst, ElseBB);
+      }
+
+      if (ThenScore != ElseScore) {
+        if (ThenScore < ElseScore)
+          std::swap(ThenBB, ElseBB);
+
+        // reorder the last two inserted elements in Order
+        if (Idx >= 2 && Order[Idx - 1]->getEntry() == ElseBB &&
+            Order[Idx - 2]->getEntry() == ThenBB) {
+          std::swap(Order[Idx - 1], Order[Idx - 2]);
+        }
+      }
+    }
+  }
+}
+
 /// Build up the general order of nodes, by performing a topological sort of the
 /// parent region's nodes, while ensuring that there is no outer cycle node
 /// between any two inner cycle nodes.
@@ -452,6 +531,7 @@ void StructurizeCFG::orderNodes() {
       // Add the SCC nodes to the Order array.
       for (const auto &N : SCC) {
         assert(I < E && "SCC size mismatch!");
+        reorderIfElseBlock(N.first->getEntry(), I);
         Order[I++] = N.first;
       }
     }
diff --git a/llvm/test/Transforms/StructurizeCFG/order-if-else.ll b/llvm/test/Transforms/StructurizeCFG/order-if-else.ll
new file mode 100644
index 0000000000000..02641f405f3b4
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/order-if-else.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
+; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s
+
+define amdgpu_kernel void @test_extractelement_1(<4 x i32> %vec, i1 %cond, ptr %ptr) {
+; CHECK-LABEL: define amdgpu_kernel void @test_extractelement_1(
+; CHECK-SAME: <4 x i32> [[VEC:%.*]], i1 [[COND:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[COND_INV:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    br i1 [[COND_INV]], label %[[ELSE:.*]], label %[[FLOW:.*]]
+; CHECK:       [[FLOW]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[A:%.*]], %[[ELSE]] ], [ poison, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 0
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[X]], 1
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[A]] = extractelement <4 x i32> [[VEC]], i32 1
+; CHECK-NEXT:    br label %[[FLOW]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[Z]], %[[THEN]] ]
+; CHECK-NEXT:    store i32 [[PHI]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %then, label %else
+
+then:
+  %x = extractelement <4 x i32> %vec, i32 0
+  %z = add i32 %x, 1
+  br label %merge
+
+else:
+  %a = extractelement <4 x i32> %vec, i32 1
+  br label %merge
+
+merge:
+  %phi = phi i32 [ %z, %then ], [ %a, %else ]
+  store i32 %phi, ptr  %ptr
+  ret void
+}
+
+define amdgpu_kernel void @test_extractelement_2(<4 x i32> %vec, i1 %cond, ptr %ptr) {
+; CHECK-LABEL: define amdgpu_kernel void @test_extractelement_2(
+; CHECK-SAME: <4 x i32> [[VEC:%.*]], i1 [[COND:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[COND_INV:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    br i1 [[COND_INV]], label %[[ELSE:.*]], label %[[FLOW:.*]]
+; CHECK:       [[FLOW]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[A:%.*]], %[[ELSE]] ], [ poison, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 1
+; CHECK-NEXT:    [[Y:%.*]] = add i32 [[X]], 1
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[Z:%.*]] = extractelement <4 x i32> [[VEC1]], i32 0
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[A]] = extractelement <4 x i32> [[VEC]], i32 1
+; CHECK-NEXT:    br label %[[FLOW]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[Z]], %[[THEN]] ]
+; CHECK-NEXT:    store i32 [[PHI]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %then, label %else
+
+then:
+  %x = extractelement <4 x i32> %vec, i32 1
+  %y = add i32 %x, 1
+  %vec1 =  insertelement <4 x i32>  poison, i32 %y,  i32 0
+  %z = extractelement <4 x i32> %vec1, i32 0
+  br label %merge
+
+else:
+  %a = extractelement <4 x i32> %vec, i32 1
+  br label %merge
+
+merge:
+  %phi = phi i32 [ %z, %then ], [ %a, %else ]
+  store i32 %phi, ptr  %ptr
+  ret void
+}
+
+%pair = type { i32, i32 }
+define amdgpu_kernel void @test_extractvalue(ptr %ptr, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @test_extractvalue(
+; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[LOAD_THEN:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4
+; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[FLOW:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
+; CHECK-NEXT:    br label %[[FLOW]]
+; CHECK:       [[FLOW]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[A_THEN]], %[[THEN]] ], [ poison, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ false, %[[THEN]] ], [ true, %[[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[ELSE:.*]], label %[[MERGE:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
+; CHECK-NEXT:    [[SUM_ELSE:%.*]] = add i32 [[A_ELSE]], 1
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[SUM_ELSE]], %[[ELSE]] ]
+; CHECK-NEXT:    store i32 [[PHI]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %load_then = load %pair, ptr %ptr
+  br i1 %cond, label %then, label %else
+
+then:
+  %a_then = extractvalue %pair %load_then, 0
+  br label %merge
+
+else:
+  %a_else = extractvalue %pair %load_then, 0
+  %sum_else = add i32 %a_else, 1
+  br label %merge
+
+merge:
+  %phi = phi i32  [ %a_then, %then ], [ %sum_else, %else ]
+  store i32 %phi, ptr  %ptr
+  ret void
+}

>From 95c47d22998030dc79e9f0867160111a557d6d28 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Mon, 12 May 2025 13:52:53 -0500
Subject: [PATCH 2/5] format correction

---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index ec54f53d6165b..af9efe9f4d160 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -447,7 +447,7 @@ INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
                     "Structurize the CFG", false, false)
 
 /// Then and Else block order in SCC is arbitrary. But based on the
-/// order, after structurization there are cases where there might be extra 
+/// order, after structurization there are cases where there might be extra
 /// VGPR copies due to interference during register coelescing.
 ///  E.g., the incoming phi values from the Else block are only VGPR copies,
 ///  while the incoming values from the Then block modify the VGPRs.
@@ -457,7 +457,7 @@ INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
 ///
 /// This function checks the incoming phi values in the merge block and
 /// orders based on the following heuristics  of Then and Else block. Checks
-/// whether an incoming phi can be potential copy instructions and if so 
+/// whether an incoming phi can be potential copy instructions and if so
 /// checks whether copy within the block or not. 
 /// Increases score if its a potential copy from outside the block. 
 /// the higher scored block is ordered first.

>From e7c1f9c3a6eac55e09a1dd19ddc48b955f164c7d Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Mon, 12 May 2025 16:39:00 -0500
Subject: [PATCH 3/5] review changes

---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 71 ++++++++++---------
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index af9efe9f4d160..d39d66f950aaf 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -414,14 +414,14 @@ class StructurizeCFGLegacyPass : public RegionPass {
 /// Helper function for heuristics to order if else block
 /// Checks whether an instruction is potential vector copy instruction, if so,
 /// checks if the operands are from different BB. if so, returns True.
-// Then there's a possibility of coelescing without interference when ordered
+// Then there's a possibility of coalescing without interference when ordered
 // first.
 static bool hasAffectingInstructions(Instruction *I, BasicBlock *BB) {
 
-  if (!I || I->getParent() != BB)
+  if (I->getParent() != BB)
     return true;
 
-  // If the instruction is not a poterntial copy instructoin, return true.
+  // If the instruction is not a poterntial copy instruction, return true.
   if (!isa<ExtractElementInst>(*I) && !isa<ExtractValueInst>(*I))
     return false;
 
@@ -448,53 +448,56 @@ INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
 
 /// Then and Else block order in SCC is arbitrary. But based on the
 /// order, after structurization there are cases where there might be extra
-/// VGPR copies due to interference during register coelescing.
+/// VGPR copies due to interference during register coalescing.
 ///  E.g., the incoming phi values from the Else block are only VGPR copies,
 ///  while the incoming values from the Then block modify the VGPRs.
-/// after structurization, there would be interference when coelesing when Then
-/// block is ordered first. But those copies can be coelesced when Else is
+/// after structurization, there would be interference during coalescing when Then
+/// block is ordered first. But those copies can be coalesced when Else is
 /// ordered first.
 ///
 /// This function checks the incoming phi values in the merge block and
 /// orders based on the following heuristics  of Then and Else block. Checks
 /// whether an incoming phi can be potential copy instructions and if so
-/// checks whether copy within the block or not. 
-/// Increases score if its a potential copy from outside the block. 
+/// checks whether copy within the block or not.
+/// Increases score if its a potential copy from outside the block.
 /// the higher scored block is ordered first.
 void StructurizeCFG::reorderIfElseBlock(BasicBlock *BB, unsigned Idx) {
   BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
 
-  if (Term && Term->isConditional()) {
-    BasicBlock *ThenBB = Term->getSuccessor(0);
-    BasicBlock *ElseBB = Term->getSuccessor(1);
-    BasicBlock *ThenSucc = ThenBB->getSingleSuccessor();
+  if (!Term || !(Term->isConditional()))
+    return;
 
-    if (BB == ThenBB->getSinglePredecessor() &&
-        (ThenBB->getSinglePredecessor() == ElseBB->getSinglePredecessor()) &&
-        (ThenSucc && ThenSucc == ElseBB->getSingleSuccessor())) {
-      unsigned ThenScore = 0, ElseScore = 0;
+  BasicBlock *ThenBB = Term->getSuccessor(0);
+  BasicBlock *ElseBB = Term->getSuccessor(1);
+  BasicBlock *ThenSucc = ThenBB->getSingleSuccessor();
 
-      for (PHINode &Phi : ThenSucc->phis()) {
-        Value *ThenVal = Phi.getIncomingValueForBlock(ThenBB);
-        Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
+  if (BB != ThenBB->getSinglePredecessor() || !ThenSucc ||
+      (ThenBB->getSinglePredecessor() != ElseBB->getSinglePredecessor()) ||
+      ThenSucc != ElseBB->getSingleSuccessor())
+    return;
 
-        if (auto *Inst = dyn_cast<Instruction>(ThenVal))
-          ThenScore += hasAffectingInstructions(Inst, ThenBB);
-        if (auto *Inst = dyn_cast<Instruction>(ElseVal))
-          ElseScore += hasAffectingInstructions(Inst, ElseBB);
-      }
+  unsigned ThenScore = 0, ElseScore = 0;
 
-      if (ThenScore != ElseScore) {
-        if (ThenScore < ElseScore)
-          std::swap(ThenBB, ElseBB);
+  for (PHINode &Phi : ThenSucc->phis()) {
+    Value *ThenVal = Phi.getIncomingValueForBlock(ThenBB);
+    Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
 
-        // reorder the last two inserted elements in Order
-        if (Idx >= 2 && Order[Idx - 1]->getEntry() == ElseBB &&
-            Order[Idx - 2]->getEntry() == ThenBB) {
-          std::swap(Order[Idx - 1], Order[Idx - 2]);
-        }
-      }
-    }
+    if (auto *Inst = dyn_cast<Instruction>(ThenVal))
+      ThenScore += hasAffectingInstructions(Inst, ThenBB);
+    if (auto *Inst = dyn_cast<Instruction>(ElseVal))
+      ElseScore += hasAffectingInstructions(Inst, ElseBB);
+  }
+
+  if (ThenScore == ElseScore)
+    return;
+
+  if (ThenScore < ElseScore)
+    std::swap(ThenBB, ElseBB);
+
+  // reorder the last two inserted elements in Order
+  if (Idx >= 2 && Order[Idx - 1]->getEntry() == ElseBB &&
+      Order[Idx - 2]->getEntry() == ThenBB) {
+    std::swap(Order[Idx - 1], Order[Idx - 2]);
   }
 }
 

>From 05c24b8c76aef230228ca2065e10da6cf65fa13f Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Mon, 12 May 2025 17:30:13 -0500
Subject: [PATCH 4/5] update tests

---
 llvm/test/CodeGen/AMDGPU/if-else.ll           | 166 ++++++++++++++++++
 .../StructurizeCFG/order-if-else.ll           |  12 +-
 2 files changed, 172 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/if-else.ll

diff --git a/llvm/test/CodeGen/AMDGPU/if-else.ll b/llvm/test/CodeGen/AMDGPU/if-else.ll
new file mode 100644
index 0000000000000..67907b120b362
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/if-else.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX900 %s
+
+define amdgpu_kernel void @test_extractelement_then_else(<4 x i32> %vec, i1 %cond, ptr %ptr) {
+; GFX900-LABEL: test_extractelement_then_else:
+; GFX900:       ; %bb.0: ; %if
+; GFX900-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_bitcmp0_b32 s6, 0
+; GFX900-NEXT:    s_cbranch_scc0 .LBB0_2
+; GFX900-NEXT:  ; %bb.1: ; %else
+; GFX900-NEXT:    s_cbranch_execz .LBB0_3
+; GFX900-NEXT:    s_branch .LBB0_4
+; GFX900-NEXT:  .LBB0_2:
+; GFX900-NEXT:  .LBB0_3: ; %then
+; GFX900-NEXT:    s_add_i32 s1, s1, 1
+; GFX900-NEXT:  .LBB0_4: ; %merge
+; GFX900-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x3c
+; GFX900-NEXT:    v_mov_b32_e32 v2, s1
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s2
+; GFX900-NEXT:    v_mov_b32_e32 v1, s3
+; GFX900-NEXT:    flat_store_dword v[0:1], v2
+; GFX900-NEXT:    s_endpgm
+if:
+  br i1 %cond, label %then, label %else
+
+then:
+  %x = extractelement <4 x i32> %vec, i32 1
+  %y = add i32 %x, 1
+  %vec1 =  insertelement <4 x i32>  poison, i32 %y,  i32 0
+  %z = extractelement <4 x i32> %vec1, i32 0
+  br label %merge
+
+else:
+  %a = extractelement <4 x i32> %vec, i32 1
+  br label %merge
+
+merge:
+  %phi = phi i32 [ %z, %then ], [ %a, %else ]
+  store i32 %phi, ptr  %ptr
+  ret void
+}
+
+define amdgpu_kernel void @test_extractelement_else_then(<4 x i32> %vec, i1 %cond, ptr %ptr) {
+; GFX900-LABEL: test_extractelement_else_then:
+; GFX900:       ; %bb.0: ; %if
+; GFX900-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_bitcmp0_b32 s6, 0
+; GFX900-NEXT:    s_cbranch_scc1 .LBB1_2
+; GFX900-NEXT:  ; %bb.1: ; %else
+; GFX900-NEXT:    s_cbranch_execz .LBB1_3
+; GFX900-NEXT:    s_branch .LBB1_4
+; GFX900-NEXT:  .LBB1_2:
+; GFX900-NEXT:  .LBB1_3: ; %then
+; GFX900-NEXT:    s_add_i32 s1, s1, 1
+; GFX900-NEXT:  .LBB1_4: ; %merge
+; GFX900-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x3c
+; GFX900-NEXT:    v_mov_b32_e32 v2, s1
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s2
+; GFX900-NEXT:    v_mov_b32_e32 v1, s3
+; GFX900-NEXT:    flat_store_dword v[0:1], v2
+; GFX900-NEXT:    s_endpgm
+if:
+  br i1 %cond, label %else, label %then
+
+else:
+  %a = extractelement <4 x i32> %vec, i32 1
+  br label %merge
+
+then:
+  %x = extractelement <4 x i32> %vec, i32 1
+  %y = add i32 %x, 1
+  %vec1 =  insertelement <4 x i32>  poison, i32 %y,  i32 0
+  %z = extractelement <4 x i32> %vec1, i32 0
+  br label %merge
+
+merge:
+  %phi = phi i32 [ %z, %then ], [ %a, %else ]
+  store i32 %phi, ptr  %ptr
+  ret void
+}
+
+%pair = type { i32, i32 }
+
+define void @test_extractvalue_then_else(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_then_else:
+; GFX900:       ; %bb.0: ; %if
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_dword v3, v[0:1]
+; GFX900-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT:    s_cbranch_execz .LBB2_2
+; GFX900-NEXT:  ; %bb.1: ; %else
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT:  .LBB2_2: ; %merge
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_store_dword v[0:1], v3
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+if:
+  %load_then = load %pair, ptr %ptr
+  br i1 %cond, label %then, label %else
+
+then:
+  %a_then = extractvalue %pair %load_then, 0
+  br label %merge
+
+else:
+  %a_else = extractvalue %pair %load_then, 0
+  %sum_else = add i32 %a_else, 1
+  br label %merge
+
+merge:
+  %phi = phi i32  [ %a_then, %then ], [ %sum_else, %else ]
+  store i32 %phi, ptr  %ptr
+  ret void
+}
+
+define void @test_extractvalue_else_then(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_else_then:
+; GFX900:       ; %bb.0: ; %if
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_dword v3, v[0:1]
+; GFX900-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v2
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT:    s_cbranch_execz .LBB3_2
+; GFX900-NEXT:  ; %bb.1: ; %else
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT:  .LBB3_2: ; %merge
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_store_dword v[0:1], v3
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+if:
+  %load_then = load %pair, ptr %ptr
+  br i1 %cond, label %else, label %then
+
+else:
+  %a_else = extractvalue %pair %load_then, 0
+  %sum_else = add i32 %a_else, 1
+  br label %merge
+
+then:
+  %a_then = extractvalue %pair %load_then, 0
+  br label %merge
+
+merge:
+  %phi = phi i32  [ %a_then, %then ], [ %sum_else, %else ]
+  store i32 %phi, ptr  %ptr
+  ret void
+}
diff --git a/llvm/test/Transforms/StructurizeCFG/order-if-else.ll b/llvm/test/Transforms/StructurizeCFG/order-if-else.ll
index 02641f405f3b4..cfcf8e2e24e37 100644
--- a/llvm/test/Transforms/StructurizeCFG/order-if-else.ll
+++ b/llvm/test/Transforms/StructurizeCFG/order-if-else.ll
@@ -2,8 +2,8 @@
 ; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
 ; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s
 
-define amdgpu_kernel void @test_extractelement_1(<4 x i32> %vec, i1 %cond, ptr %ptr) {
-; CHECK-LABEL: define amdgpu_kernel void @test_extractelement_1(
+define void @test_extractelement_1(<4 x i32> %vec, i1 %cond, ptr %ptr) {
+; CHECK-LABEL: define void @test_extractelement_1(
 ; CHECK-SAME: <4 x i32> [[VEC:%.*]], i1 [[COND:%.*]], ptr [[PTR:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[COND_INV:%.*]] = xor i1 [[COND]], true
@@ -42,8 +42,8 @@ merge:
   ret void
 }
 
-define amdgpu_kernel void @test_extractelement_2(<4 x i32> %vec, i1 %cond, ptr %ptr) {
-; CHECK-LABEL: define amdgpu_kernel void @test_extractelement_2(
+define void @test_extractelement_2(<4 x i32> %vec, i1 %cond, ptr %ptr) {
+; CHECK-LABEL: define void @test_extractelement_2(
 ; CHECK-SAME: <4 x i32> [[VEC:%.*]], i1 [[COND:%.*]], ptr [[PTR:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[COND_INV:%.*]] = xor i1 [[COND]], true
@@ -87,8 +87,8 @@ merge:
 }
 
 %pair = type { i32, i32 }
-define amdgpu_kernel void @test_extractvalue(ptr %ptr, i1 %cond) {
-; CHECK-LABEL: define amdgpu_kernel void @test_extractvalue(
+define void @test_extractvalue(ptr %ptr, i1 %cond) {
+; CHECK-LABEL: define void @test_extractvalue(
 ; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[LOAD_THEN:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4

>From d279104c1c21cd0dd6883cb28304c747fa871b7a Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Tue, 10 Jun 2025 16:58:44 -0500
Subject: [PATCH 5/5] zero cost instruction
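
This revision replaces the explicit ExtractElementInst/ExtractValueInst
check with a TargetTransformInfo code-size cost query and threads TTI
through both the legacy and new pass managers. A rough sketch of the
check, using the same TTI call as the diff below (the helper name here
is hypothetical):

  // Sketch only; assumes llvm/Analysis/TargetTransformInfo.h.
  // Treat an instruction as a potential no-op copy when its code-size
  // cost is zero, instead of matching specific opcodes.
  static bool isZeroCostCandidate(Instruction *I,
                                  const TargetTransformInfo &TTI) {
    InstructionCost Cost =
        TTI.getInstructionCost(I, TargetTransformInfo::TCK_CodeSize);
    return Cost.isValid() && Cost.getValue() == 0;
  }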

---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 50 ++++++++++++-------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index d39d66f950aaf..023cfdcfd5d34 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -287,6 +288,7 @@ class StructurizeCFG {
 
   UniformityInfo *UA = nullptr;
   DominatorTree *DT;
+  TargetTransformInfo *TTI;
 
   SmallVector<RegionNode *, 8> Order;
   BBSet Visited;
@@ -367,7 +369,7 @@ class StructurizeCFG {
 
 public:
   void init(Region *R);
-  bool run(Region *R, DominatorTree *DT);
+  bool run(Region *R, DominatorTree *DT, TargetTransformInfo *TTI);
   bool makeUniformRegion(Region *R, UniformityInfo &UA);
 };
 
@@ -393,8 +395,11 @@ class StructurizeCFGLegacyPass : public RegionPass {
       if (SCFG.makeUniformRegion(R, UA))
         return false;
     }
+    Function *F = R->getEntry()->getParent();
+    TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
     DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    return SCFG.run(R, DT);
+    return SCFG.run(R, DT, TTI);
   }
 
   StringRef getPassName() const override { return "Structurize control flow"; }
@@ -402,6 +407,7 @@ class StructurizeCFGLegacyPass : public RegionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     if (SkipUniformRegions)
       AU.addRequired<UniformityInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
 
     AU.addPreserved<DominatorTreeWrapperPass>();
@@ -411,18 +417,23 @@ class StructurizeCFGLegacyPass : public RegionPass {
 
 } // end anonymous namespace
 
-/// Helper function for heuristics to order if else block
-/// Checks whether an instruction is potential vector copy instruction, if so,
-/// checks if the operands are from different BB. if so, returns True.
-// Then there's a possibility of coalescing without interference when ordered
-// first.
-static bool hasAffectingInstructions(Instruction *I, BasicBlock *BB) {
+/// Helper function for the heuristics that order if/else blocks.
+/// Checks whether an instruction is a zero-cost instruction and whether its
+/// operands come from a different BB. If so, the instruction can be coalesced
+/// when this block is ordered first, and this returns true.
+static bool hasAffectingInstructions(Instruction *I, BasicBlock *BB,
+                                     TargetTransformInfo *TTI) {
 
   if (I->getParent() != BB)
     return true;
 
-  // If the instruction is not a poterntial copy instruction, return true.
-  if (!isa<ExtractElementInst>(*I) && !isa<ExtractValueInst>(*I))
+  // If the instruction is not a zero cost instruction, return false.
+  auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_CodeSize);
+  InstructionCost::CostType CostVal =
+      Cost.isValid()
+          ? Cost.getValue()
+          : (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive;
+  if (CostVal != 0)
     return false;
 
   // Check if any operands are instructions defined in the same block.
@@ -457,9 +468,9 @@ INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
 ///
 /// This function checks the incoming phi values in the merge block and
 /// orders based on the following heuristics  of Then and Else block. Checks
-/// whether an incoming phi can be potential copy instructions and if so
-/// checks whether copy within the block or not.
-/// Increases score if its a potential copy from outside the block.
+/// whether an incoming phi is a zero-cost instruction and, if so, checks
+/// whether its operands come from within the block or not.
+/// Increases the score if the operands come from outside the block;
 /// the higher scored block is ordered first.
 void StructurizeCFG::reorderIfElseBlock(BasicBlock *BB, unsigned Idx) {
   BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
@@ -483,9 +494,9 @@ void StructurizeCFG::reorderIfElseBlock(BasicBlock *BB, unsigned Idx) {
     Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
 
     if (auto *Inst = dyn_cast<Instruction>(ThenVal))
-      ThenScore += hasAffectingInstructions(Inst, ThenBB);
+      ThenScore += hasAffectingInstructions(Inst, ThenBB, TTI);
     if (auto *Inst = dyn_cast<Instruction>(ElseVal))
-      ElseScore += hasAffectingInstructions(Inst, ElseBB);
+      ElseScore += hasAffectingInstructions(Inst, ElseBB, TTI);
   }
 
   if (ThenScore == ElseScore)
@@ -1390,12 +1401,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
 }
 
 /// Run the transformation for each region found
-bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
+bool StructurizeCFG::run(Region *R, DominatorTree *DT,
+                         TargetTransformInfo *TTI) {
   if (R->isTopLevelRegion())
     return false;
 
   this->DT = DT;
-
+  this->TTI = TTI;
   Func = R->getEntry()->getParent();
   assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
 
@@ -1457,7 +1469,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
   bool Changed = false;
   DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
   auto &RI = AM.getResult<RegionInfoAnalysis>(F);
-
+  TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
   UniformityInfo *UI = nullptr;
   if (SkipUniformRegions)
     UI = &AM.getResult<UniformityInfoAnalysis>(F);
@@ -1476,7 +1488,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
       continue;
     }
 
-    Changed |= SCFG.run(R, DT);
+    Changed |= SCFG.run(R, DT, TTI);
   }
   if (!Changed)
     return PreservedAnalyses::all();


