[llvm] Revert "[RegAlloc] Relax the split constrain on MBB prolog" (PR #169990)

via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 29 04:17:47 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-regalloc

Author: theRonShark (ronlieb)

<details>
<summary>Changes</summary>

Reverts llvm/llvm-project#<!-- -->168259

breaks hip buildbot

---

Patch is 560.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169990.diff


11 Files Affected:

- (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+7-2) 
- (modified) llvm/lib/CodeGen/SplitKit.cpp (-48) 
- (modified) llvm/lib/CodeGen/SplitKit.h (-8) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+2662-2707) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+95-92) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+143-150) 
- (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir (+31-31) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-before-exec.mir (-5) 
- (removed) llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir (-167) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+268-269) 
- (modified) llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll (+38-40) 


``````````diff
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4db20dc39fb32..a059cb55371a3 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -774,7 +774,8 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
       // Abort if the spill cannot be inserted at the MBB' start
       if (((BC.Entry == SpillPlacement::MustSpill) ||
            (BC.Entry == SpillPlacement::PrefSpill)) &&
-          !SA->canSplitBeforeProlog(BC.Number))
+          SlotIndex::isEarlierInstr(BI.FirstInstr,
+                                    SA->getFirstSplitPoint(BC.Number)))
         return false;
     }
 
@@ -829,7 +830,11 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
     BCS[B].Number = Number;
 
     // Abort if the spill cannot be inserted at the MBB' start
-    if (!SA->canSplitBeforeProlog(Number))
+    MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
+    auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
+    if (FirstNonDebugInstr != MBB->end() &&
+        SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
+                                  SA->getFirstSplitPoint(Number)))
       return false;
     // Interference for the live-in value.
     if (Intf.first() <= Indexes->getMBBStartIdx(Number))
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index f27ff674dcf8c..8ec4bfbb5a330 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -147,54 +147,6 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
   return LIS.getInstructionFromIndex(LIP);
 }
 
-bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI,
-                                               const MachineBasicBlock &MBB) {
-  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-
-  for (auto &MI : MBB) {
-    if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() ||
-        MI.isPseudoProbe())
-      continue;
-
-    if (!TII->isBasicBlockPrologue(MI))
-      return true;
-
-    for (auto &MO : MI.operands()) {
-      if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
-        continue;
-
-      // For the AMDGPU target if a MBB contains exec mask restore preamble,
-      // SplitEditor may get state when it cannot insert a spill instruction
-      // at the begin of the MBB.
-      // E.g. for a MIR
-      // bb.100:
-      //     %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
-      //          implicit $exec
-      //     ...
-      //     use %1
-      // If the regalloc try to allocate a virtreg to the physreg already
-      // assigned to virtreg %1 and the pyhsreg is computed as the best
-      // candidate for split, it may insert COPY instruction.
-      //  bb.100:
-      //     %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
-      //          implicit $exec
-      //     %2 = COPY %orig
-      //     ...
-      //     use %1
-      // Thus %1 and %orig still have interference. We may add cost for the
-      // physreg candidate or abandon the candidate.
-      const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-      const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
-      const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
-      const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg());
-      if (TRI->getCommonSubClass(RC, CurRC))
-        return false;
-    }
-  }
-
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 //                                 Split Analysis
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index a9fc921534d0e..de255911268f2 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -89,9 +89,6 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
     return Res;
   }
 
-  /// Return true if we can split \pCurLI before \pMBB's prolog.
-  bool canSplitBeforeProlog(const LiveInterval &CurLI,
-                            const MachineBasicBlock &MBB);
 };
 
 /// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
@@ -250,11 +247,6 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
   SlotIndex getFirstSplitPoint(unsigned Num) {
     return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
   }
-
-  bool canSplitBeforeProlog(unsigned Num) {
-    MachineBasicBlock *BB = MF.getBlockNumbered(Num);
-    return IPA.canSplitBeforeProlog(*CurLI, *BB);
-  }
 };
 
 /// SplitEditor - Edit machine code and LiveIntervals for live range
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 10f7b701c3122..4c5c56a49fdc6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -151238,13 +151238,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:308
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:304
 ; SI-NEXT:    ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v41, s30, 0
+; SI-NEXT:    s_mov_b32 s73, s21
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v44, s19, 0
 ; SI-NEXT:    v_writelane_b32 v44, s18, 1
 ; SI-NEXT:    v_writelane_b32 v44, s17, 2
 ; SI-NEXT:    v_writelane_b32 v44, s16, 3
+; SI-NEXT:    v_writelane_b32 v41, s30, 0
 ; SI-NEXT:    v_writelane_b32 v41, s31, 1
 ; SI-NEXT:    v_writelane_b32 v41, s34, 2
 ; SI-NEXT:    v_writelane_b32 v41, s35, 3
@@ -151268,8 +151268,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s69, 21
 ; SI-NEXT:    v_writelane_b32 v41, s70, 22
 ; SI-NEXT:    v_writelane_b32 v41, s71, 23
-; SI-NEXT:    s_mov_b32 s57, s28
-; SI-NEXT:    s_mov_b32 s47, s27
+; SI-NEXT:    s_mov_b32 s74, s29
+; SI-NEXT:    s_mov_b32 s78, s28
+; SI-NEXT:    s_mov_b32 s76, s27
 ; SI-NEXT:    v_writelane_b32 v41, s80, 24
 ; SI-NEXT:    v_writelane_b32 v41, s81, 25
 ; SI-NEXT:    v_writelane_b32 v41, s82, 26
@@ -151279,6 +151280,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s86, 30
 ; SI-NEXT:    v_writelane_b32 v41, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s96, 32
+; SI-NEXT:    s_mov_b32 s47, s26
 ; SI-NEXT:    v_writelane_b32 v41, s97, 33
 ; SI-NEXT:    v_writelane_b32 v41, s98, 34
 ; SI-NEXT:    v_writelane_b32 v41, s99, 35
@@ -151288,101 +151290,95 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:156
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT:    v_readfirstlane_b32 s89, v3
-; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s90, v9
-; SI-NEXT:    v_writelane_b32 v42, s89, 0
-; SI-NEXT:    v_readfirstlane_b32 s91, v10
-; SI-NEXT:    v_writelane_b32 v42, s90, 1
-; SI-NEXT:    v_readfirstlane_b32 s92, v8
-; SI-NEXT:    v_writelane_b32 v42, s91, 2
-; SI-NEXT:    v_readfirstlane_b32 s93, v7
-; SI-NEXT:    v_writelane_b32 v42, s92, 3
-; SI-NEXT:    v_readfirstlane_b32 s94, v13
-; SI-NEXT:    v_writelane_b32 v42, s93, 4
-; SI-NEXT:    v_readfirstlane_b32 s95, v14
-; SI-NEXT:    v_writelane_b32 v42, s94, 5
-; SI-NEXT:    v_writelane_b32 v42, s95, 6
-; SI-NEXT:    v_readfirstlane_b32 s30, v17
-; SI-NEXT:    v_readfirstlane_b32 s31, v18
-; SI-NEXT:    v_readfirstlane_b32 s34, v16
-; SI-NEXT:    v_readfirstlane_b32 s35, v15
-; SI-NEXT:    v_readfirstlane_b32 s36, v21
 ; SI-NEXT:    v_readfirstlane_b32 s37, v22
+; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    v_readfirstlane_b32 s38, v20
+; SI-NEXT:    v_writelane_b32 v43, s37, 0
 ; SI-NEXT:    v_readfirstlane_b32 s39, v19
+; SI-NEXT:    v_writelane_b32 v43, s38, 1
 ; SI-NEXT:    v_readfirstlane_b32 s48, v25
+; SI-NEXT:    v_writelane_b32 v43, s39, 2
 ; SI-NEXT:    v_readfirstlane_b32 s49, v26
+; SI-NEXT:    v_writelane_b32 v43, s48, 3
 ; SI-NEXT:    v_readfirstlane_b32 s50, v24
+; SI-NEXT:    v_writelane_b32 v43, s49, 4
 ; SI-NEXT:    v_readfirstlane_b32 s51, v23
+; SI-NEXT:    v_writelane_b32 v43, s50, 5
 ; SI-NEXT:    v_readfirstlane_b32 s52, v29
+; SI-NEXT:    v_writelane_b32 v43, s51, 6
 ; SI-NEXT:    v_readfirstlane_b32 s53, v30
+; SI-NEXT:    v_writelane_b32 v43, s52, 7
+; SI-NEXT:    v_readfirstlane_b32 s54, v28
+; SI-NEXT:    v_writelane_b32 v43, s53, 8
+; SI-NEXT:    v_readfirstlane_b32 s55, v27
+; SI-NEXT:    v_writelane_b32 v43, s54, 9
+; SI-NEXT:    v_writelane_b32 v43, s55, 10
+; SI-NEXT:    s_mov_b32 s57, s24
+; SI-NEXT:    v_readfirstlane_b32 s16, v1
+; SI-NEXT:    v_readfirstlane_b32 s17, v2
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s6, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:300
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:292
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT:    v_writelane_b32 v44, s4, 4
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 5
+; SI-NEXT:    v_writelane_b32 v44, s4, 4
 ; SI-NEXT:    v_readfirstlane_b32 s4, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT:    v_writelane_b32 v44, s4, 6
+; SI-NEXT:    v_writelane_b32 v44, s4, 5
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 7
+; SI-NEXT:    v_writelane_b32 v44, s4, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 8
+; SI-NEXT:    v_writelane_b32 v44, s4, 7
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 9
+; SI-NEXT:    v_writelane_b32 v44, s4, 8
 ; SI-NEXT:    v_readfirstlane_b32 s4, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:268
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT:    v_writelane_b32 v44, s4, 10
+; SI-NEXT:    v_writelane_b32 v44, s4, 9
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 11
-; SI-NEXT:    v_readfirstlane_b32 s54, v28
-; SI-NEXT:    v_readfirstlane_b32 s55, v27
-; SI-NEXT:    s_mov_b32 s6, s23
-; SI-NEXT:    s_mov_b32 s23, s21
-; SI-NEXT:    s_mov_b32 s58, s26
-; SI-NEXT:    s_mov_b32 s40, s25
-; SI-NEXT:    s_mov_b32 s25, s24
-; SI-NEXT:    v_readfirstlane_b32 s16, v1
-; SI-NEXT:    v_readfirstlane_b32 s17, v2
+; SI-NEXT:    v_writelane_b32 v44, s4, 10
 ; SI-NEXT:    v_readfirstlane_b32 s18, v5
 ; SI-NEXT:    v_readfirstlane_b32 s19, v6
 ; SI-NEXT:    v_readfirstlane_b32 s77, v4
-; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s26, v53
-; SI-NEXT:    v_readfirstlane_b32 s46, v54
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s61, v55
+; SI-NEXT:    v_readfirstlane_b32 s89, v3
+; SI-NEXT:    v_readfirstlane_b32 s90, v9
+; SI-NEXT:    v_readfirstlane_b32 s91, v10
+; SI-NEXT:    v_readfirstlane_b32 s92, v8
+; SI-NEXT:    v_readfirstlane_b32 s93, v7
+; SI-NEXT:    v_readfirstlane_b32 s94, v13
+; SI-NEXT:    v_readfirstlane_b32 s95, v14
+; SI-NEXT:    v_readfirstlane_b32 s30, v17
+; SI-NEXT:    v_readfirstlane_b32 s31, v18
+; SI-NEXT:    v_readfirstlane_b32 s34, v16
+; SI-NEXT:    v_readfirstlane_b32 s35, v15
+; SI-NEXT:    v_readfirstlane_b32 s36, v21
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s62, v40
+; SI-NEXT:    v_readfirstlane_b32 s24, v40
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 12
+; SI-NEXT:    v_writelane_b32 v44, s4, 11
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 13
+; SI-NEXT:    v_writelane_b32 v44, s4, 12
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 14
+; SI-NEXT:    v_writelane_b32 v44, s4, 13
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 15
+; SI-NEXT:    v_writelane_b32 v44, s4, 14
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 16
+; SI-NEXT:    v_writelane_b32 v44, s4, 15
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v51
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
@@ -151392,51 +151388,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:236
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:228
-; SI-NEXT:    v_writelane_b32 v44, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 18
+; SI-NEXT:    v_readfirstlane_b32 s75, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    v_readfirstlane_b32 s21, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v44, s4, 19
+; SI-NEXT:    v_writelane_b32 v44, s4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 21
+; SI-NEXT:    v_readfirstlane_b32 s40, v35
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 22
+; SI-NEXT:    v_readfirstlane_b32 s61, v36
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v37
-; SI-NEXT:    v_writelane_b32 v44, s4, 23
+; SI-NEXT:    v_readfirstlane_b32 s63, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:216
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:204
+; SI-NEXT:    v_writelane_b32 v44, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 24
+; SI-NEXT:    v_readfirstlane_b32 s59, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 25
+; SI-NEXT:    v_readfirstlane_b32 s56, v38
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 26
+; SI-NEXT:    v_readfirstlane_b32 s43, v39
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 27
+; SI-NEXT:    v_readfirstlane_b32 s46, v48
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 28
+; SI-NEXT:    v_readfirstlane_b32 s42, v49
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 29
+; SI-NEXT:    v_readfirstlane_b32 s13, v50
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s4, v51
+; SI-NEXT:    v_readfirstlane_b32 s45, v51
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -151444,47 +151429,45 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:184
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:176
-; SI-NEXT:    v_writelane_b32 v44, s4, 30
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 31
+; SI-NEXT:    v_readfirstlane_b32 s88, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    v_readfirstlane_b32 s79, v33
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:172
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT:    v_writelane_b32 v44, s4, 32
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 33
+; SI-NEXT:    v_writelane_b32 v44, s4, 18
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 34
+; SI-NEXT:    v_writelane_b32 v44, s4, 19
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 35
+; SI-NEXT:    v_writelane_b32 v44, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s43, v37
+; SI-NEXT:    v_readfirstlane_b32 s4, v37
+; SI-NEXT:    v_writelane_b32 v44, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 36
+; SI-NEXT:    v_writelane_b32 v44, s4, 22
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 37
+; SI-NEXT:    v_writelane_b32 v44, s4, 23
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 38
+; SI-NEXT:    v_writelane_b32 v44, s4, 24
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 39
+; SI-NEXT:    v_writelane_b32 v44, s4, 25
 ; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 40
+; SI-NEXT:    v_writelane_b32 v44, s4, 26
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 41
+; SI-NEXT:    v_writelane_b32 v44, s4, 27
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v51
-; SI-NEXT:    v_writelane_b32 v44, s4, 42
+; SI-NEXT:    v_writelane_b32 v44, s4, 28
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
 ; SI-NEXT:    s_waitcnt vmcnt(3)
@@ -151500,31 +151483,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:108
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT:    v_writelane_b32 v44, s4, 43
-; SI-NEXT:    v_writelane_b32 v44, s22, 44
-; SI-NEXT:    v_writelane_b32 v44, s6, 45
-; SI-NEXT:    v_writelane_b32 v44, s23, 46
-; SI-NEXT:    v_writelane_b32 v44, s20, 47
-; SI-NEXT:    v_writelane_b32 v44, s58, 48
-; SI-NEXT:    v_writelane_b32 v44, s47, 49
-; SI-NEXT:    v_writelane_b32 v44, s40, 50
-; SI-NEXT:    v_writelane_b32 v44, s25, 51
-; SI-NEXT:    v_writelane_b32 v44, s29, 52
-; SI-NEXT:    v_writelane_b32 v44, s57, 53
-; SI-NEXT:    v_writelane_b32 v44, s62, 54
+; SI-NEXT:    v_writelane_b32 v44, s4, 29
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s21, v52
-; SI-NEXT:    v_writelane_b32 v44, s61, 55
-; SI-NEXT:    v_writelane_b32 v44, s21, 56
-; SI-NEXT:    v_writelane_b32 v44, s26, 57
-; SI-NEXT:    v_writelane_b32 v44, s46, 58
-; SI-NEXT:    v_writelane_b32 v44, s16, 59
-; SI-NEXT:    v_writelane_b32 v44, s17, 60
-; SI-NEXT:    v_writelane_b32 v44, s18, 61
-; SI-NEXT:    v_writelane_b32 v44, s...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/169990


More information about the llvm-commits mailing list