[llvm] [CodeGen] Ignore requiresStructuredCFG check in canSplitCriticalEdge if successor is loop header (PR #154063)

Thu Sep 25 16:51:30 PDT 2025

https://github.com/wenju-he updated https://github.com/llvm/llvm-project/pull/154063

>From 1c2a9e2b97910e1d59852999ae5b2196d1976ebe Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Mon, 18 Aug 2025 06:37:03 +0200
Subject: [PATCH 1/9] [CodeGen] Ignore requiresStructuredCFG check in
 canSplitCriticalEdge if successor is loop header

This addresses a performance issue for our downstream GPU target that
sets requiresStructuredCFG to true. The issue is that the EarlyMachineLICM
pass does not hoist loop invariants because the critical edge into the
loop header is not split, so no preheader is created to hoist them into.
---
 llvm/include/llvm/CodeGen/MachineBasicBlock.h |  4 +++-
 llvm/lib/CodeGen/MachineBasicBlock.cpp        | 15 ++++++++++++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 9e3d9196cc184..78e5dd99eab06 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -1035,7 +1035,9 @@ class MachineBasicBlock
   /// Succ, can be split. If this returns true a subsequent call to
   /// SplitCriticalEdge is guaranteed to return a valid basic block if
   /// no changes occurred in the meantime.
-  LLVM_ABI bool canSplitCriticalEdge(const MachineBasicBlock *Succ) const;
+  LLVM_ABI bool
+  canSplitCriticalEdge(const MachineBasicBlock *Succ,
+                       const SplitCriticalEdgeAnalyses &Analyses = {}) const;
 
   void pop_front() { Insts.pop_front(); }
   void pop_back() { Insts.pop_back(); }
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index c3c5a0f5102d7..82c852bb0f93b 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1160,7 +1160,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
 MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
     MachineBasicBlock *Succ, const SplitCriticalEdgeAnalyses &Analyses,
     std::vector<SparseBitVector<>> *LiveInSets, MachineDomTreeUpdater *MDTU) {
-  if (!canSplitCriticalEdge(Succ))
+  if (!canSplitCriticalEdge(Succ, Analyses))
     return nullptr;
 
   MachineFunction *MF = getParent();
@@ -1389,7 +1389,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
 }
 
 bool MachineBasicBlock::canSplitCriticalEdge(
-    const MachineBasicBlock *Succ) const {
+    const MachineBasicBlock *Succ,
+    const SplitCriticalEdgeAnalyses &Analyses) const {
   // Splitting the critical edge to a landing pad block is non-trivial. Don't do
   // it in this generic function.
   if (Succ->isEHPad())
@@ -1403,7 +1404,15 @@ bool MachineBasicBlock::canSplitCriticalEdge(
   const MachineFunction *MF = getParent();
   // Performance might be harmed on HW that implements branching using exec mask
   // where both sides of the branches are always executed.
-  if (MF->getTarget().requiresStructuredCFG())
+  // However, if `Succ` is a loop header, splitting the critical edge will not
+  // break structured CFG.
+  auto SuccIsLoopHeader = [&]() {
+    if (MachineLoopInfo *MLI = Analyses.MLI)
+      if (MachineLoop *L = MLI->getLoopFor(Succ); L && L->getHeader() == Succ)
+        return true;
+    return false;
+  };
+  if (MF->getTarget().requiresStructuredCFG() && !SuccIsLoopHeader())
     return false;
 
   // Do we have an Indirect jump with a jumptable that we can rewrite?

>From 2a193599a5d8c0d506ef53a3d8aebc2c10ac9e00 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Mon, 1 Sep 2025 08:19:52 +0200
Subject: [PATCH 2/9] pass MLI as new arg

---
 llvm/include/llvm/CodeGen/MachineBasicBlock.h | 2 +-
 llvm/lib/CodeGen/MachineBasicBlock.cpp        | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 78e5dd99eab06..7df34a76912dd 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -1037,7 +1037,7 @@ class MachineBasicBlock
   /// no changes occurred in the meantime.
   LLVM_ABI bool
   canSplitCriticalEdge(const MachineBasicBlock *Succ,
-                       const SplitCriticalEdgeAnalyses &Analyses = {}) const;
+                       const MachineLoopInfo *MLI = nullptr) const;
 
   void pop_front() { Insts.pop_front(); }
   void pop_back() { Insts.pop_back(); }
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 82c852bb0f93b..8c795f812df09 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1160,7 +1160,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
 MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
     MachineBasicBlock *Succ, const SplitCriticalEdgeAnalyses &Analyses,
     std::vector<SparseBitVector<>> *LiveInSets, MachineDomTreeUpdater *MDTU) {
-  if (!canSplitCriticalEdge(Succ, Analyses))
+  if (!canSplitCriticalEdge(Succ, Analyses.MLI))
     return nullptr;
 
   MachineFunction *MF = getParent();
@@ -1388,9 +1388,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
   return NMBB;
 }
 
-bool MachineBasicBlock::canSplitCriticalEdge(
-    const MachineBasicBlock *Succ,
-    const SplitCriticalEdgeAnalyses &Analyses) const {
+bool MachineBasicBlock::canSplitCriticalEdge(const MachineBasicBlock *Succ,
+                                             const MachineLoopInfo *MLI) const {
   // Splitting the critical edge to a landing pad block is non-trivial. Don't do
   // it in this generic function.
   if (Succ->isEHPad())
@@ -1407,7 +1406,7 @@ bool MachineBasicBlock::canSplitCriticalEdge(
   // However, if `Succ` is a loop header, splitting the critical edge will not
   // break structured CFG.
   auto SuccIsLoopHeader = [&]() {
-    if (MachineLoopInfo *MLI = Analyses.MLI)
+    if (MLI)
       if (MachineLoop *L = MLI->getLoopFor(Succ); L && L->getHeader() == Succ)
         return true;
     return false;

>From f9217abe32b0bb5e21aff8b9c22916bc0008c820 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Mon, 1 Sep 2025 10:18:04 +0200
Subject: [PATCH 3/9] add nvptx test

---
 .../NVPTX/machinelicm-no-preheader.mir        | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir

diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
new file mode 100644
index 0000000000000..f2f0ffdec8094
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
@@ -0,0 +1,72 @@
+# RUN: llc -mtriple=nvptx64 -mcpu=sm_20 -run-pass=early-machinelicm %s -verify-machineinstrs -o - | FileCheck %s
+
+# This test checks that the early-machineLICM pass successfully creates a new
+# loop preheader by splitting the critical edge and hoisting the loop invariant
+# value `%18` to the preheader.
+# Since the critical edge successor is a loop header, the splitting does not
+# break the structured CFG, which is a requirement for the NVPTX target.
+
+---
+name:            test_hoist
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 1, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 4, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 5, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 6, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 7, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 10, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 12, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 14, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 16, class: b1, preferred-register: '', flags: [  ] }
+  - { id: 17, class: b1, preferred-register: '', flags: [  ] }
+  - { id: 18, class: b32, preferred-register: '', flags: [  ] }
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+
+    %8:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_2, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+    %7:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+    %9:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
+    %10:b32 = INT_PTX_SREG_CTAID_x
+    %11:b32 = INT_PTX_SREG_NTID_x
+    %12:b32 = INT_PTX_SREG_TID_x
+    %13:b64 = CVT_u64_u32 killed %12, 0
+    %14:b64 = nuw MAD_WIDE_U32rrr killed %11, killed %10, killed %13
+    %15:b64 = nuw nsw SHL64_ri killed %14, 2
+    %0:b64 = nuw ADD64rr killed %9, killed %15
+    %1:b32 = LD_i32 0, 0, 1, 3, 32, %0, 0
+    %16:b1 = SETP_i32ri %8, 0, 0
+    CBranch killed %16, %bb.2
+    GOTO %bb.1
+
+  ; CHECK: bb.3:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   %18:b32 = ADD32ri %7, -1
+  ; CHECK: bb.1:
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+    %2:b32 = PHI %8, %bb.0, %5, %bb.1
+    %3:b32 = PHI %1, %bb.0, %4, %bb.1
+    %18:b32 = ADD32ri %7, -1
+    %4:b32 = SREM32rr %3, %18
+    %5:b32 = ADD32ri %2, -1
+    %17:b1 = SETP_i32ri %5, 0, 1
+    CBranch killed %17, %bb.1
+    GOTO %bb.2
+
+  bb.2:
+    %6:b32 = PHI %1, %bb.0, %4, %bb.1
+    ST_i32 %6, 0, 0, 1, 32, %0, 0
+    Return
+...

>From ced084c62f75c554ed0dd6087407f6203c11a1bb Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Wed, 24 Sep 2025 05:20:44 +0200
Subject: [PATCH 4/9] fix test

---
 .../NVPTX/machinelicm-no-preheader.mir        | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
index f2f0ffdec8094..7a1fc6dda5472 100644
--- a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
+++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
@@ -22,13 +22,14 @@ registers:
   - { id: 9, class: b64, preferred-register: '', flags: [  ] }
   - { id: 10, class: b32, preferred-register: '', flags: [  ] }
   - { id: 11, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 12, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 13, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 12, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 13, class: b32, preferred-register: '', flags: [  ] }
   - { id: 14, class: b64, preferred-register: '', flags: [  ] }
   - { id: 15, class: b64, preferred-register: '', flags: [  ] }
-  - { id: 16, class: b1, preferred-register: '', flags: [  ] }
+  - { id: 16, class: b64, preferred-register: '', flags: [  ] }
   - { id: 17, class: b1, preferred-register: '', flags: [  ] }
   - { id: 18, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: b1, preferred-register: '', flags: [  ] }
 body:             |
   bb.0.entry:
     successors: %bb.2(0x30000000), %bb.1(0x50000000)
@@ -38,14 +39,15 @@ body:             |
     %9:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
     %10:b32 = INT_PTX_SREG_CTAID_x
     %11:b32 = INT_PTX_SREG_NTID_x
-    %12:b32 = INT_PTX_SREG_TID_x
-    %13:b64 = CVT_u64_u32 killed %12, 0
-    %14:b64 = nuw MAD_WIDE_U32rrr killed %11, killed %10, killed %13
-    %15:b64 = nuw nsw SHL64_ri killed %14, 2
-    %0:b64 = nuw ADD64rr killed %9, killed %15
+    %12:b64 = MUL_WIDEu32_rr killed %11, killed %10
+    %13:b32 = INT_PTX_SREG_TID_x
+    %14:b64 = CVT_u64_u32 killed %13, 0
+    %15:b64 = nuw ADD64rr killed %12, killed %14
+    %16:b64 = nuw nsw SHL64_ri killed %15, 2
+    %0:b64 = nuw ADD64rr killed %9, killed %16
     %1:b32 = LD_i32 0, 0, 1, 3, 32, %0, 0
-    %16:b1 = SETP_i32ri %8, 0, 0
-    CBranch killed %16, %bb.2
+    %17:b1 = SETP_i32ri %8, 0, 0
+    CBranch killed %17, %bb.2
     GOTO %bb.1
 
   ; CHECK: bb.3:
@@ -61,8 +63,8 @@ body:             |
     %18:b32 = ADD32ri %7, -1
     %4:b32 = SREM32rr %3, %18
     %5:b32 = ADD32ri %2, -1
-    %17:b1 = SETP_i32ri %5, 0, 1
-    CBranch killed %17, %bb.1
+    %19:b1 = SETP_i32ri %5, 0, 1
+    CBranch killed %19, %bb.1
     GOTO %bb.2
 
   bb.2:

>From 865d70f72252f589f8232af8025c84ac4b387059 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Wed, 24 Sep 2025 07:27:16 +0200
Subject: [PATCH 5/9] remove lambda

---
 llvm/lib/CodeGen/MachineBasicBlock.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index efc8ad50b53d7..87d7e8048dec6 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1405,13 +1405,12 @@ bool MachineBasicBlock::canSplitCriticalEdge(const MachineBasicBlock *Succ,
   // where both sides of the branches are always executed.
   // However, if `Succ` is a loop header, splitting the critical edge will not
   // break structured CFG.
-  auto SuccIsLoopHeader = [&]() {
-    if (MLI)
-      if (MachineLoop *L = MLI->getLoopFor(Succ); L && L->getHeader() == Succ)
-        return true;
-    return false;
-  };
-  if (MF->getTarget().requiresStructuredCFG() && !SuccIsLoopHeader())
+  bool SuccIsLoopHeader = false;
+  if (MLI) {
+    const MachineLoop *L = MLI->getLoopFor(Succ);
+    SuccIsLoopHeader = L && L->getHeader() == Succ;
+  }
+  if (MF->getTarget().requiresStructuredCFG() && !SuccIsLoopHeader)
     return false;
 
   // Do we have an Indirect jump with a jumptable that we can rewrite?

>From f41c259d0e0a42533a3145e16a26f4a62d4392e8 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Wed, 24 Sep 2025 07:43:10 +0200
Subject: [PATCH 6/9] simplify test

---
 .../NVPTX/machinelicm-no-preheader.mir        | 86 ++++++++++---------
 1 file changed, 46 insertions(+), 40 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
index 7a1fc6dda5472..9d5895d5bee1b 100644
--- a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
+++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
@@ -1,8 +1,9 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
 # RUN: llc -mtriple=nvptx64 -mcpu=sm_20 -run-pass=early-machinelicm %s -verify-machineinstrs -o - | FileCheck %s
 
 # This test checks that the early-machineLICM pass successfully creates a new
 # loop preheader by splitting the critical edge and hoisting the loop invariant
-# value `%18` to the preheader.
+# value `%8` to the preheader.
 # Since the critical edge successor is a loop header, the splitting does not
 # break the structured CFG, which is a requirement for the NVPTX target.
 
@@ -16,59 +17,64 @@ registers:
   - { id: 3, class: b32, preferred-register: '', flags: [  ] }
   - { id: 4, class: b32, preferred-register: '', flags: [  ] }
   - { id: 5, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 6, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 7, class: b32, preferred-register: '', flags: [  ] }
+  - { id: 6, class: b64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: b1, preferred-register: '', flags: [  ] }
   - { id: 8, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 9, class: b64, preferred-register: '', flags: [  ] }
-  - { id: 10, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 11, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 12, class: b64, preferred-register: '', flags: [  ] }
-  - { id: 13, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 14, class: b64, preferred-register: '', flags: [  ] }
-  - { id: 15, class: b64, preferred-register: '', flags: [  ] }
-  - { id: 16, class: b64, preferred-register: '', flags: [  ] }
-  - { id: 17, class: b1, preferred-register: '', flags: [  ] }
-  - { id: 18, class: b32, preferred-register: '', flags: [  ] }
-  - { id: 19, class: b1, preferred-register: '', flags: [  ] }
+  - { id: 9, class: b1, preferred-register: '', flags: [  ] }
 body:             |
+  ; CHECK-LABEL: name: test_hoist
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x30000000), %bb.3(0x50000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+  ; CHECK-NEXT:   [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
+  ; CHECK-NEXT:   [[ADD64ri:%[0-9]+]]:b64 = nuw ADD64ri killed [[LD_i64_]], 2
+  ; CHECK-NEXT:   [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, [[ADD64ri]], 0
+  ; CHECK-NEXT:   [[SETP_i32ri:%[0-9]+]]:b1 = SETP_i32ri [[LD_i32_]], 0, 0
+  ; CHECK-NEXT:   CBranch killed [[SETP_i32ri]], %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADD32ri:%[0-9]+]]:b32 = ADD32ri [[LD_i32_]], -1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:b32 = PHI [[LD_i32_1]], %bb.3, %3, %bb.1
+  ; CHECK-NEXT:   [[SREM32rr:%[0-9]+]]:b32 = SREM32rr [[PHI]], [[ADD32ri]]
+  ; CHECK-NEXT:   [[SETP_i32ri1:%[0-9]+]]:b1 = SETP_i32ri [[SREM32rr]], 0, 1
+  ; CHECK-NEXT:   CBranch killed [[SETP_i32ri1]], %bb.1
+  ; CHECK-NEXT:   GOTO %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:b32 = PHI [[LD_i32_1]], %bb.0, [[SREM32rr]], %bb.1
+  ; CHECK-NEXT:   ST_i32 [[PHI1]], 0, 0, 1, 32, [[ADD64ri]], 0
+  ; CHECK-NEXT:   Return
   bb.0.entry:
     successors: %bb.2(0x30000000), %bb.1(0x50000000)
 
-    %8:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_2, 0 :: (dereferenceable invariant load (s32), addrspace 101)
-    %7:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
-    %9:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
-    %10:b32 = INT_PTX_SREG_CTAID_x
-    %11:b32 = INT_PTX_SREG_NTID_x
-    %12:b64 = MUL_WIDEu32_rr killed %11, killed %10
-    %13:b32 = INT_PTX_SREG_TID_x
-    %14:b64 = CVT_u64_u32 killed %13, 0
-    %15:b64 = nuw ADD64rr killed %12, killed %14
-    %16:b64 = nuw nsw SHL64_ri killed %15, 2
-    %0:b64 = nuw ADD64rr killed %9, killed %16
+    %5:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+    %6:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
+    %0:b64 = nuw ADD64ri killed %6, 2
     %1:b32 = LD_i32 0, 0, 1, 3, 32, %0, 0
-    %17:b1 = SETP_i32ri %8, 0, 0
-    CBranch killed %17, %bb.2
+    %7:b1 = SETP_i32ri %5, 0, 0
+    CBranch killed %7, %bb.2
     GOTO %bb.1
 
-  ; CHECK: bb.3:
-  ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   %18:b32 = ADD32ri %7, -1
-  ; CHECK: bb.1:
 
   bb.1:
     successors: %bb.2(0x04000000), %bb.1(0x7c000000)
 
-    %2:b32 = PHI %8, %bb.0, %5, %bb.1
-    %3:b32 = PHI %1, %bb.0, %4, %bb.1
-    %18:b32 = ADD32ri %7, -1
-    %4:b32 = SREM32rr %3, %18
-    %5:b32 = ADD32ri %2, -1
-    %19:b1 = SETP_i32ri %5, 0, 1
-    CBranch killed %19, %bb.1
+    %2:b32 = PHI %1, %bb.0, %3, %bb.1
+    %8:b32 = ADD32ri %5, -1
+    %3:b32 = SREM32rr %2, %8
+    %9:b1 = SETP_i32ri %3, 0, 1
+    CBranch killed %9, %bb.1
     GOTO %bb.2
 
   bb.2:
-    %6:b32 = PHI %1, %bb.0, %4, %bb.1
-    ST_i32 %6, 0, 0, 1, 32, %0, 0
+    %4:b32 = PHI %1, %bb.0, %3, %bb.1
+    ST_i32 %4, 0, 0, 1, 32, %0, 0
     Return
 ...

>From c71b27eeb49b3a7693f5ac07406dce50e2e39b24 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Fri, 26 Sep 2025 07:42:32 +0800
Subject: [PATCH 7/9] Apply suggestion from @arsenm

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 llvm/lib/CodeGen/MachineBasicBlock.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 87d7e8048dec6..8adeb2f529a4b 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1403,15 +1403,17 @@ bool MachineBasicBlock::canSplitCriticalEdge(const MachineBasicBlock *Succ,
   const MachineFunction *MF = getParent();
   // Performance might be harmed on HW that implements branching using exec mask
   // where both sides of the branches are always executed.
-  // However, if `Succ` is a loop header, splitting the critical edge will not
-  // break structured CFG.
-  bool SuccIsLoopHeader = false;
-  if (MLI) {
-    const MachineLoop *L = MLI->getLoopFor(Succ);
-    SuccIsLoopHeader = L && L->getHeader() == Succ;
-  }
-  if (MF->getTarget().requiresStructuredCFG() && !SuccIsLoopHeader)
+
+  if (MF->getTarget().requiresStructuredCFG()) {
+    // If `Succ` is a loop header, splitting the critical edge will not
+    // break structured CFG.
+    if (MLI) {
+       const MachineLoop *L = MLI->getLoopFor(Succ);
+       return L && L->getHeader() == Succ;
+    }
+
     return false;
+  }
 
   // Do we have an Indirect jump with a jumptable that we can rewrite?
   int JTI = findJumpTableIndex(*this);

>From 1db818c2e752eebe3e6f781ad6d6126e9de32491 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Fri, 26 Sep 2025 07:42:45 +0800
Subject: [PATCH 8/9] Update
 llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
index 9d5895d5bee1b..0b2d85600a2ef 100644
--- a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
+++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=nvptx64 -mcpu=sm_20 -run-pass=early-machinelicm %s -verify-machineinstrs -o - | FileCheck %s
+# RUN: llc -mtriple=nvptx64 -mcpu=sm_20 -run-pass=early-machinelicm %s -o - | FileCheck %s
 
 # This test checks that the early-machineLICM pass successfully creates a new
 # loop preheader by splitting the critical edge and hoisting the loop invariant

>From ec32c8b2a8774822567b6ebf34fe3c967ff19e6f Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Fri, 26 Sep 2025 01:51:14 +0200
Subject: [PATCH 9/9] clang-format

---
 llvm/lib/CodeGen/MachineBasicBlock.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 8adeb2f529a4b..71db679720997 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1408,8 +1408,8 @@ bool MachineBasicBlock::canSplitCriticalEdge(const MachineBasicBlock *Succ,
     // If `Succ` is a loop header, splitting the critical edge will not
     // break structured CFG.
     if (MLI) {
-       const MachineLoop *L = MLI->getLoopFor(Succ);
-       return L && L->getHeader() == Succ;
+      const MachineLoop *L = MLI->getLoopFor(Succ);
+      return L && L->getHeader() == Succ;
     }
 
     return false;