[llvm] Revert "[RegAlloc] Relax the split constrain on MBB prolog" (PR #169990)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 29 04:17:47 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-regalloc
Author: theRonShark (ronlieb)
<details>
<summary>Changes</summary>
Reverts llvm/llvm-project#<!-- -->168259
breaks hip buildbot
---
Patch is 560.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169990.diff
11 Files Affected:
- (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+7-2)
- (modified) llvm/lib/CodeGen/SplitKit.cpp (-48)
- (modified) llvm/lib/CodeGen/SplitKit.h (-8)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+2662-2707)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+95-92)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+143-150)
- (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir (+31-31)
- (modified) llvm/test/CodeGen/AMDGPU/spill-before-exec.mir (-5)
- (removed) llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir (-167)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+268-269)
- (modified) llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll (+38-40)
``````````diff
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4db20dc39fb32..a059cb55371a3 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -774,7 +774,8 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
// Abort if the spill cannot be inserted at the MBB' start
if (((BC.Entry == SpillPlacement::MustSpill) ||
(BC.Entry == SpillPlacement::PrefSpill)) &&
- !SA->canSplitBeforeProlog(BC.Number))
+ SlotIndex::isEarlierInstr(BI.FirstInstr,
+ SA->getFirstSplitPoint(BC.Number)))
return false;
}
@@ -829,7 +830,11 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
BCS[B].Number = Number;
// Abort if the spill cannot be inserted at the MBB' start
- if (!SA->canSplitBeforeProlog(Number))
+ MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
+ auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
+ if (FirstNonDebugInstr != MBB->end() &&
+ SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
+ SA->getFirstSplitPoint(Number)))
return false;
// Interference for the live-in value.
if (Intf.first() <= Indexes->getMBBStartIdx(Number))
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index f27ff674dcf8c..8ec4bfbb5a330 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -147,54 +147,6 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
return LIS.getInstructionFromIndex(LIP);
}
-bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI,
- const MachineBasicBlock &MBB) {
- const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-
- for (auto &MI : MBB) {
- if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() ||
- MI.isPseudoProbe())
- continue;
-
- if (!TII->isBasicBlockPrologue(MI))
- return true;
-
- for (auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
- continue;
-
- // For the AMDGPU target if a MBB contains exec mask restore preamble,
- // SplitEditor may get state when it cannot insert a spill instruction
- // at the begin of the MBB.
- // E.g. for a MIR
- // bb.100:
- // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
- // implicit $exec
- // ...
- // use %1
- // If the regalloc try to allocate a virtreg to the physreg already
- // assigned to virtreg %1 and the pyhsreg is computed as the best
- // candidate for split, it may insert COPY instruction.
- // bb.100:
- // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
- // implicit $exec
- // %2 = COPY %orig
- // ...
- // use %1
- // Thus %1 and %orig still have interference. We may add cost for the
- // physreg candidate or abandon the candidate.
- const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
- const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
- const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg());
- if (TRI->getCommonSubClass(RC, CurRC))
- return false;
- }
- }
-
- return true;
-}
-
//===----------------------------------------------------------------------===//
// Split Analysis
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index a9fc921534d0e..de255911268f2 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -89,9 +89,6 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
return Res;
}
- /// Return true if we can split \pCurLI before \pMBB's prolog.
- bool canSplitBeforeProlog(const LiveInterval &CurLI,
- const MachineBasicBlock &MBB);
};
/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
@@ -250,11 +247,6 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
SlotIndex getFirstSplitPoint(unsigned Num) {
return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
}
-
- bool canSplitBeforeProlog(unsigned Num) {
- MachineBasicBlock *BB = MF.getBlockNumbered(Num);
- return IPA.canSplitBeforeProlog(*CurLI, *BB);
- }
};
/// SplitEditor - Edit machine code and LiveIntervals for live range
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 10f7b701c3122..4c5c56a49fdc6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -151238,13 +151238,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_writelane_b32 v41, s30, 0
+; SI-NEXT: s_mov_b32 s73, s21
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v44, s19, 0
; SI-NEXT: v_writelane_b32 v44, s18, 1
; SI-NEXT: v_writelane_b32 v44, s17, 2
; SI-NEXT: v_writelane_b32 v44, s16, 3
+; SI-NEXT: v_writelane_b32 v41, s30, 0
; SI-NEXT: v_writelane_b32 v41, s31, 1
; SI-NEXT: v_writelane_b32 v41, s34, 2
; SI-NEXT: v_writelane_b32 v41, s35, 3
@@ -151268,8 +151268,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s69, 21
; SI-NEXT: v_writelane_b32 v41, s70, 22
; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: s_mov_b32 s57, s28
-; SI-NEXT: s_mov_b32 s47, s27
+; SI-NEXT: s_mov_b32 s74, s29
+; SI-NEXT: s_mov_b32 s78, s28
+; SI-NEXT: s_mov_b32 s76, s27
; SI-NEXT: v_writelane_b32 v41, s80, 24
; SI-NEXT: v_writelane_b32 v41, s81, 25
; SI-NEXT: v_writelane_b32 v41, s82, 26
@@ -151279,6 +151280,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s86, 30
; SI-NEXT: v_writelane_b32 v41, s87, 31
; SI-NEXT: v_writelane_b32 v41, s96, 32
+; SI-NEXT: s_mov_b32 s47, s26
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: v_writelane_b32 v41, s99, 35
@@ -151288,101 +151290,95 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: v_readfirstlane_b32 s90, v9
-; SI-NEXT: v_writelane_b32 v42, s89, 0
-; SI-NEXT: v_readfirstlane_b32 s91, v10
-; SI-NEXT: v_writelane_b32 v42, s90, 1
-; SI-NEXT: v_readfirstlane_b32 s92, v8
-; SI-NEXT: v_writelane_b32 v42, s91, 2
-; SI-NEXT: v_readfirstlane_b32 s93, v7
-; SI-NEXT: v_writelane_b32 v42, s92, 3
-; SI-NEXT: v_readfirstlane_b32 s94, v13
-; SI-NEXT: v_writelane_b32 v42, s93, 4
-; SI-NEXT: v_readfirstlane_b32 s95, v14
-; SI-NEXT: v_writelane_b32 v42, s94, 5
-; SI-NEXT: v_writelane_b32 v42, s95, 6
-; SI-NEXT: v_readfirstlane_b32 s30, v17
-; SI-NEXT: v_readfirstlane_b32 s31, v18
-; SI-NEXT: v_readfirstlane_b32 s34, v16
-; SI-NEXT: v_readfirstlane_b32 s35, v15
-; SI-NEXT: v_readfirstlane_b32 s36, v21
; SI-NEXT: v_readfirstlane_b32 s37, v22
+; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s38, v20
+; SI-NEXT: v_writelane_b32 v43, s37, 0
; SI-NEXT: v_readfirstlane_b32 s39, v19
+; SI-NEXT: v_writelane_b32 v43, s38, 1
; SI-NEXT: v_readfirstlane_b32 s48, v25
+; SI-NEXT: v_writelane_b32 v43, s39, 2
; SI-NEXT: v_readfirstlane_b32 s49, v26
+; SI-NEXT: v_writelane_b32 v43, s48, 3
; SI-NEXT: v_readfirstlane_b32 s50, v24
+; SI-NEXT: v_writelane_b32 v43, s49, 4
; SI-NEXT: v_readfirstlane_b32 s51, v23
+; SI-NEXT: v_writelane_b32 v43, s50, 5
; SI-NEXT: v_readfirstlane_b32 s52, v29
+; SI-NEXT: v_writelane_b32 v43, s51, 6
; SI-NEXT: v_readfirstlane_b32 s53, v30
+; SI-NEXT: v_writelane_b32 v43, s52, 7
+; SI-NEXT: v_readfirstlane_b32 s54, v28
+; SI-NEXT: v_writelane_b32 v43, s53, 8
+; SI-NEXT: v_readfirstlane_b32 s55, v27
+; SI-NEXT: v_writelane_b32 v43, s54, 9
+; SI-NEXT: v_writelane_b32 v43, s55, 10
+; SI-NEXT: s_mov_b32 s57, s24
+; SI-NEXT: v_readfirstlane_b32 s16, v1
+; SI-NEXT: v_readfirstlane_b32 s17, v2
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s6, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT: v_writelane_b32 v44, s4, 4
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v32
-; SI-NEXT: v_writelane_b32 v44, s4, 5
+; SI-NEXT: v_writelane_b32 v44, s4, 4
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT: v_writelane_b32 v44, s4, 6
+; SI-NEXT: v_writelane_b32 v44, s4, 5
; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v44, s4, 7
+; SI-NEXT: v_writelane_b32 v44, s4, 6
; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v44, s4, 8
+; SI-NEXT: v_writelane_b32 v44, s4, 7
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v44, s4, 9
+; SI-NEXT: v_writelane_b32 v44, s4, 8
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT: v_writelane_b32 v44, s4, 10
+; SI-NEXT: v_writelane_b32 v44, s4, 9
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v44, s4, 11
-; SI-NEXT: v_readfirstlane_b32 s54, v28
-; SI-NEXT: v_readfirstlane_b32 s55, v27
-; SI-NEXT: s_mov_b32 s6, s23
-; SI-NEXT: s_mov_b32 s23, s21
-; SI-NEXT: s_mov_b32 s58, s26
-; SI-NEXT: s_mov_b32 s40, s25
-; SI-NEXT: s_mov_b32 s25, s24
-; SI-NEXT: v_readfirstlane_b32 s16, v1
-; SI-NEXT: v_readfirstlane_b32 s17, v2
+; SI-NEXT: v_writelane_b32 v44, s4, 10
; SI-NEXT: v_readfirstlane_b32 s18, v5
; SI-NEXT: v_readfirstlane_b32 s19, v6
; SI-NEXT: v_readfirstlane_b32 s77, v4
-; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT: v_readfirstlane_b32 s26, v53
-; SI-NEXT: v_readfirstlane_b32 s46, v54
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s61, v55
+; SI-NEXT: v_readfirstlane_b32 s89, v3
+; SI-NEXT: v_readfirstlane_b32 s90, v9
+; SI-NEXT: v_readfirstlane_b32 s91, v10
+; SI-NEXT: v_readfirstlane_b32 s92, v8
+; SI-NEXT: v_readfirstlane_b32 s93, v7
+; SI-NEXT: v_readfirstlane_b32 s94, v13
+; SI-NEXT: v_readfirstlane_b32 s95, v14
+; SI-NEXT: v_readfirstlane_b32 s30, v17
+; SI-NEXT: v_readfirstlane_b32 s31, v18
+; SI-NEXT: v_readfirstlane_b32 s34, v16
+; SI-NEXT: v_readfirstlane_b32 s35, v15
+; SI-NEXT: v_readfirstlane_b32 s36, v21
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s62, v40
+; SI-NEXT: v_readfirstlane_b32 s24, v40
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v44, s4, 12
+; SI-NEXT: v_writelane_b32 v44, s4, 11
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v44, s4, 13
+; SI-NEXT: v_writelane_b32 v44, s4, 12
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v44, s4, 14
+; SI-NEXT: v_writelane_b32 v44, s4, 13
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v44, s4, 15
+; SI-NEXT: v_writelane_b32 v44, s4, 14
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v44, s4, 16
+; SI-NEXT: v_writelane_b32 v44, s4, 15
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
@@ -151392,51 +151388,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228
-; SI-NEXT: v_writelane_b32 v44, s4, 17
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s4, v32
-; SI-NEXT: v_writelane_b32 v44, s4, 18
+; SI-NEXT: v_readfirstlane_b32 s75, v32
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: v_readfirstlane_b32 s21, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v44, s4, 19
+; SI-NEXT: v_writelane_b32 v44, s4, 16
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v44, s4, 20
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v44, s4, 21
+; SI-NEXT: v_readfirstlane_b32 s40, v35
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v44, s4, 22
+; SI-NEXT: v_readfirstlane_b32 s61, v36
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s4, v37
-; SI-NEXT: v_writelane_b32 v44, s4, 23
+; SI-NEXT: v_readfirstlane_b32 s63, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204
+; SI-NEXT: v_writelane_b32 v44, s4, 17
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v44, s4, 24
+; SI-NEXT: v_readfirstlane_b32 s59, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v44, s4, 25
+; SI-NEXT: v_readfirstlane_b32 s56, v38
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v44, s4, 26
+; SI-NEXT: v_readfirstlane_b32 s43, v39
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v44, s4, 27
+; SI-NEXT: v_readfirstlane_b32 s46, v48
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v44, s4, 28
+; SI-NEXT: v_readfirstlane_b32 s42, v49
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v44, s4, 29
+; SI-NEXT: v_readfirstlane_b32 s13, v50
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s4, v51
+; SI-NEXT: v_readfirstlane_b32 s45, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -151444,47 +151429,45 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
-; SI-NEXT: v_writelane_b32 v44, s4, 30
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s4, v32
-; SI-NEXT: v_writelane_b32 v44, s4, 31
+; SI-NEXT: v_readfirstlane_b32 s88, v32
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: v_readfirstlane_b32 s79, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT: v_writelane_b32 v44, s4, 32
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v44, s4, 33
+; SI-NEXT: v_writelane_b32 v44, s4, 18
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v44, s4, 34
+; SI-NEXT: v_writelane_b32 v44, s4, 19
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v44, s4, 35
+; SI-NEXT: v_writelane_b32 v44, s4, 20
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s43, v37
+; SI-NEXT: v_readfirstlane_b32 s4, v37
+; SI-NEXT: v_writelane_b32 v44, s4, 21
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v44, s4, 36
+; SI-NEXT: v_writelane_b32 v44, s4, 22
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v44, s4, 37
+; SI-NEXT: v_writelane_b32 v44, s4, 23
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v44, s4, 38
+; SI-NEXT: v_writelane_b32 v44, s4, 24
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v44, s4, 39
+; SI-NEXT: v_writelane_b32 v44, s4, 25
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v44, s4, 40
+; SI-NEXT: v_writelane_b32 v44, s4, 26
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v44, s4, 41
+; SI-NEXT: v_writelane_b32 v44, s4, 27
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s4, v51
-; SI-NEXT: v_writelane_b32 v44, s4, 42
+; SI-NEXT: v_writelane_b32 v44, s4, 28
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(3)
@@ -151500,31 +151483,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT: v_writelane_b32 v44, s4, 43
-; SI-NEXT: v_writelane_b32 v44, s22, 44
-; SI-NEXT: v_writelane_b32 v44, s6, 45
-; SI-NEXT: v_writelane_b32 v44, s23, 46
-; SI-NEXT: v_writelane_b32 v44, s20, 47
-; SI-NEXT: v_writelane_b32 v44, s58, 48
-; SI-NEXT: v_writelane_b32 v44, s47, 49
-; SI-NEXT: v_writelane_b32 v44, s40, 50
-; SI-NEXT: v_writelane_b32 v44, s25, 51
-; SI-NEXT: v_writelane_b32 v44, s29, 52
-; SI-NEXT: v_writelane_b32 v44, s57, 53
-; SI-NEXT: v_writelane_b32 v44, s62, 54
+; SI-NEXT: v_writelane_b32 v44, s4, 29
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s21, v52
-; SI-NEXT: v_writelane_b32 v44, s61, 55
-; SI-NEXT: v_writelane_b32 v44, s21, 56
-; SI-NEXT: v_writelane_b32 v44, s26, 57
-; SI-NEXT: v_writelane_b32 v44, s46, 58
-; SI-NEXT: v_writelane_b32 v44, s16, 59
-; SI-NEXT: v_writelane_b32 v44, s17, 60
-; SI-NEXT: v_writelane_b32 v44, s18, 61
-; SI-NEXT: v_writelane_b32 v44, s...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/169990
More information about the llvm-commits
mailing list