[llvm] ce7fd49 - [AMDGPU] RA inserted scalar instructions can be at the BB top (#72140)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 15 21:00:07 PST 2023


Author: Christudasan Devadasan
Date: 2023-11-16T10:30:03+05:30
New Revision: ce7fd498ed91344c23f0864bbd5b84d65eaae3ef

URL: https://github.com/llvm/llvm-project/commit/ce7fd498ed91344c23f0864bbd5b84d65eaae3ef
DIFF: https://github.com/llvm/llvm-project/commit/ce7fd498ed91344c23f0864bbd5b84d65eaae3ef.diff

LOG: [AMDGPU] RA inserted scalar instructions can be at the BB top (#72140)

We adjust the insertion point at the BB top for spills/copies during RA
to ensure they are placed after the exec-restore instructions required
for divergent control-flow execution. This adjustment, however, is
needed only for vector operations; insertions for scalar registers can
still go to the BB top.
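
To make the effect concrete, the following is a minimal standalone sketch of
the insertion-point decision. It is not the LLVM API: the MI struct and the
isBlockPrologue/insertionPoint helpers are simplified stand-ins for
SIInstrInfo::isBasicBlockPrologue and MachineBasicBlock::SkipPHIsLabelsAndDebug
under the assumption that only exec-restoring instructions pin the prologue.

// Standalone sketch (illustrative names, not the LLVM API).
#include <cstdio>
#include <vector>

struct MI {
  const char *Name;
  bool IsPHI;        // PHI / label / debug pseudo at the block top
  bool RestoresExec; // e.g. an s_or_saveexec emitted for divergent CFG
};

// Stand-in for isBasicBlockPrologue(MI, Reg): exec restores only pin the
// insertion point for vector registers; scalar (SGPR) spills/copies do not
// depend on the exec mask and may be placed above them.
static bool isBlockPrologue(const MI &I, bool RegIsScalar) {
  if (RegIsScalar)
    return false;
  return I.RestoresExec;
}

// Stand-in for SkipPHIsLabelsAndDebug(begin(), Reg): skip PHIs and, for
// vector registers, the block-prologue instructions as well.
static size_t insertionPoint(const std::vector<MI> &Block, bool RegIsScalar) {
  size_t I = 0;
  while (I < Block.size() &&
         (Block[I].IsPHI || isBlockPrologue(Block[I], RegIsScalar)))
    ++I;
  return I;
}

int main() {
  std::vector<MI> Block = {
      {"PHI", true, false},
      {"s_or_saveexec_b64", false, true}, // exec restore for divergent CFG
      {"v_add_f32", false, false},
  };
  // A VGPR spill must land after the exec restore (index 2); an SGPR spill
  // can go right after the PHIs at the top of the block (index 1).
  std::printf("vector spill insert index: %zu\n", insertionPoint(Block, false));
  std::printf("scalar spill insert index: %zu\n", insertionPoint(Block, true));
}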

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/MachineBasicBlock.h
    llvm/include/llvm/CodeGen/TargetInstrInfo.h
    llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
    llvm/lib/CodeGen/InlineSpiller.cpp
    llvm/lib/CodeGen/MachineBasicBlock.cpp
    llvm/lib/CodeGen/SplitKit.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
    llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 4b5336fac33ea46..5812295f73b5a2e 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -846,8 +846,10 @@ class MachineBasicBlock
 
   /// Return the first instruction in MBB after I that is not a PHI, label or
   /// debug.  This is the correct point to insert copies at the beginning of a
-  /// basic block.
-  iterator SkipPHIsLabelsAndDebug(iterator I, bool SkipPseudoOp = true);
+  /// basic block. \p Reg is the register being used by a spill or defined for a
+  /// restore/split during register allocation.
+  iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg = Register(),
+                                  bool SkipPseudoOp = true);
 
   /// Returns an iterator to the first terminator instruction of this basic
   /// block. If a terminator does not exist, it returns end().

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 8e7499ac626a747..c83c11d4e776fab 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1988,8 +1988,10 @@ class TargetInstrInfo : public MCInstrInfo {
 
   /// True if the instruction is bound to the top of its basic block and no
   /// other instructions shall be inserted before it. This can be implemented
-  /// to prevent register allocator to insert spills before such instructions.
-  virtual bool isBasicBlockPrologue(const MachineInstr &MI) const {
+  /// to prevent register allocator to insert spills for \p Reg before such
+  /// instructions.
+  virtual bool isBasicBlockPrologue(const MachineInstr &MI,
+                                    Register Reg = Register()) const {
     return false;
   }
 

diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
index 75504ef32250c52..4d668c53f7156b8 100644
--- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
+++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -461,7 +461,8 @@ class StatepointState {
 
       if (EHPad && !RC.hasReload(Reg, RegToSlotIdx[Reg], EHPad)) {
         RC.recordReload(Reg, RegToSlotIdx[Reg], EHPad);
-        auto EHPadInsertPoint = EHPad->SkipPHIsLabelsAndDebug(EHPad->begin());
+        auto EHPadInsertPoint =
+            EHPad->SkipPHIsLabelsAndDebug(EHPad->begin(), Reg);
         insertReloadBefore(Reg, EHPadInsertPoint, EHPad);
         LLVM_DEBUG(dbgs() << "...also reload at EHPad "
                           << printMBBReference(*EHPad) << "\n");

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 71d58b2e9e18d7d..2740265f75340b5 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -469,7 +469,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
   MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
   MachineBasicBlock::iterator MII;
   if (SrcVNI->isPHIDef())
-    MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin());
+    MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin(), SrcReg);
   else {
     MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
     assert(DefMI && "Defining instruction disappeared");

diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index d9e22685faf5f5e..4410fb7ecd23b64 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -223,13 +223,13 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
 
 MachineBasicBlock::iterator
 MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I,
-                                          bool SkipPseudoOp) {
+                                          Register Reg, bool SkipPseudoOp) {
   const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
 
   iterator E = end();
   while (I != E && (I->isPHI() || I->isPosition() || I->isDebugInstr() ||
                     (SkipPseudoOp && I->isPseudoProbe()) ||
-                    TII->isBasicBlockPrologue(*I)))
+                    TII->isBasicBlockPrologue(*I, Reg)))
     ++I;
   // FIXME: This needs to change if we wish to bundle labels / dbg_values
   // inside the bundle.

diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index 1664c304f643c3f..b1c862210932bc3 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -795,8 +795,10 @@ SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
     return Start;
   }
 
-  VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
-                              MBB.SkipPHIsLabelsAndDebug(MBB.begin()));
+  unsigned RegIdx = 0;
+  Register Reg = LIS.getInterval(Edit->get(RegIdx)).reg();
+  VNInfo *VNI = defFromParent(RegIdx, ParentVNI, Start, MBB,
+                              MBB.SkipPHIsLabelsAndDebug(MBB.begin(), Reg));
   RegAssign.insert(Start, VNI->def, OpenIdx);
   LLVM_DEBUG(dump());
   return VNI->def;

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5f78dfff1e98852..2751c6b4ea9987c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8476,16 +8476,25 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
   return AMDGPU::COPY;
 }
 
-bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
+bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
+                                       Register Reg) const {
   // We need to handle instructions which may be inserted during register
   // allocation to handle the prolog. The initial prolog instruction may have
   // been separated from the start of the block by spills and copies inserted
-  // needed by the prolog.
-  uint16_t Opc = MI.getOpcode();
+  // needed by the prolog. However, the insertions for scalar registers can
+  // always be placed at the BB top as they are independent of the exec mask
+  // value.
+  bool IsNullOrVectorRegister = true;
+  if (Reg) {
+    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+    IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
+  }
 
+  uint16_t Opc = MI.getOpcode();
   // FIXME: Copies inserted in the block prolog for live-range split should also
   // be included.
-  return (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
+  return IsNullOrVectorRegister &&
+         (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
                                  MI.modifiesRegister(AMDGPU::EXEC, &RI)));
 }
 

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 29f549fc29a3ce6..de2820e5c013ee3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1179,7 +1179,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   unsigned getLiveRangeSplitOpcode(Register Reg,
                                    const MachineFunction &MF) const override;
 
-  bool isBasicBlockPrologue(const MachineInstr &MI) const override;
+  bool isBasicBlockPrologue(const MachineInstr &MI,
+                            Register Reg = Register()) const override;
 
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator InsPt,

diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 8098304d134229d..ffbf00765adbe22 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -168,7 +168,6 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
-; CHECK-NEXT:    s_or_saveexec_b64 s[20:21], s[26:27]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_readlane_b32 s12, v5, 32
 ; CHECK-NEXT:    v_readlane_b32 s13, v5, 33
@@ -178,39 +177,39 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_readlane_b32 s17, v5, 37
 ; CHECK-NEXT:    v_readlane_b32 s18, v5, 38
 ; CHECK-NEXT:    v_readlane_b32 s19, v5, 39
-; CHECK-NEXT:    v_writelane_b32 v5, s4, 56
-; CHECK-NEXT:    v_writelane_b32 v5, s5, 57
-; CHECK-NEXT:    v_writelane_b32 v5, s6, 58
-; CHECK-NEXT:    v_writelane_b32 v5, s7, 59
-; CHECK-NEXT:    v_writelane_b32 v5, s8, 60
-; CHECK-NEXT:    v_writelane_b32 v5, s9, 61
-; CHECK-NEXT:    v_writelane_b32 v5, s10, 62
-; CHECK-NEXT:    v_writelane_b32 v5, s11, 63
-; CHECK-NEXT:    v_writelane_b32 v5, s52, 40
-; CHECK-NEXT:    v_writelane_b32 v5, s53, 41
-; CHECK-NEXT:    v_writelane_b32 v5, s54, 42
-; CHECK-NEXT:    v_writelane_b32 v5, s55, 43
-; CHECK-NEXT:    v_writelane_b32 v5, s56, 44
-; CHECK-NEXT:    v_writelane_b32 v5, s57, 45
-; CHECK-NEXT:    v_writelane_b32 v5, s58, 46
-; CHECK-NEXT:    v_writelane_b32 v5, s59, 47
-; CHECK-NEXT:    v_writelane_b32 v4, s12, 0
-; CHECK-NEXT:    v_writelane_b32 v5, s60, 48
-; CHECK-NEXT:    v_writelane_b32 v4, s13, 1
-; CHECK-NEXT:    v_writelane_b32 v5, s61, 49
-; CHECK-NEXT:    v_writelane_b32 v4, s14, 2
-; CHECK-NEXT:    v_writelane_b32 v5, s62, 50
-; CHECK-NEXT:    v_writelane_b32 v4, s15, 3
-; CHECK-NEXT:    v_writelane_b32 v5, s63, 51
-; CHECK-NEXT:    v_writelane_b32 v4, s16, 4
-; CHECK-NEXT:    v_writelane_b32 v5, s64, 52
-; CHECK-NEXT:    v_writelane_b32 v4, s17, 5
-; CHECK-NEXT:    v_writelane_b32 v5, s65, 53
-; CHECK-NEXT:    v_writelane_b32 v4, s18, 6
-; CHECK-NEXT:    v_writelane_b32 v5, s66, 54
-; CHECK-NEXT:    v_writelane_b32 v4, s19, 7
-; CHECK-NEXT:    v_writelane_b32 v5, s67, 55
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[20:21]
+; CHECK-NEXT:    v_writelane_b32 v5, s4, 40
+; CHECK-NEXT:    v_writelane_b32 v5, s5, 41
+; CHECK-NEXT:    v_writelane_b32 v5, s6, 42
+; CHECK-NEXT:    v_writelane_b32 v5, s7, 43
+; CHECK-NEXT:    v_writelane_b32 v5, s8, 44
+; CHECK-NEXT:    v_writelane_b32 v5, s9, 45
+; CHECK-NEXT:    v_writelane_b32 v5, s10, 46
+; CHECK-NEXT:    v_writelane_b32 v5, s11, 47
+; CHECK-NEXT:    v_writelane_b32 v5, s12, 48
+; CHECK-NEXT:    v_writelane_b32 v5, s13, 49
+; CHECK-NEXT:    v_writelane_b32 v5, s14, 50
+; CHECK-NEXT:    v_writelane_b32 v5, s15, 51
+; CHECK-NEXT:    v_writelane_b32 v5, s16, 52
+; CHECK-NEXT:    v_writelane_b32 v5, s17, 53
+; CHECK-NEXT:    v_writelane_b32 v5, s18, 54
+; CHECK-NEXT:    v_writelane_b32 v5, s19, 55
+; CHECK-NEXT:    v_writelane_b32 v5, s52, 56
+; CHECK-NEXT:    v_writelane_b32 v4, s60, 0
+; CHECK-NEXT:    v_writelane_b32 v5, s53, 57
+; CHECK-NEXT:    v_writelane_b32 v4, s61, 1
+; CHECK-NEXT:    v_writelane_b32 v5, s54, 58
+; CHECK-NEXT:    v_writelane_b32 v4, s62, 2
+; CHECK-NEXT:    v_writelane_b32 v5, s55, 59
+; CHECK-NEXT:    v_writelane_b32 v4, s63, 3
+; CHECK-NEXT:    v_writelane_b32 v5, s56, 60
+; CHECK-NEXT:    v_writelane_b32 v4, s64, 4
+; CHECK-NEXT:    v_writelane_b32 v5, s57, 61
+; CHECK-NEXT:    v_writelane_b32 v4, s65, 5
+; CHECK-NEXT:    v_writelane_b32 v5, s58, 62
+; CHECK-NEXT:    v_writelane_b32 v4, s66, 6
+; CHECK-NEXT:    v_writelane_b32 v5, s59, 63
+; CHECK-NEXT:    v_writelane_b32 v4, s67, 7
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[20:21], s[26:27]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[24:25]
@@ -265,39 +264,35 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0
 ; CHECK-NEXT:  .LBB0_6: ; %Flow12
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[22:23]
+; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[22:23]
+; CHECK-NEXT:    v_readlane_b32 s52, v5, 40
+; CHECK-NEXT:    v_readlane_b32 s53, v5, 41
+; CHECK-NEXT:    v_readlane_b32 s54, v5, 42
+; CHECK-NEXT:    v_readlane_b32 s55, v5, 43
+; CHECK-NEXT:    v_readlane_b32 s56, v5, 44
+; CHECK-NEXT:    v_readlane_b32 s57, v5, 45
+; CHECK-NEXT:    v_readlane_b32 s58, v5, 46
+; CHECK-NEXT:    v_readlane_b32 s59, v5, 47
+; CHECK-NEXT:    v_readlane_b32 s60, v5, 48
+; CHECK-NEXT:    v_readlane_b32 s61, v5, 49
+; CHECK-NEXT:    v_readlane_b32 s62, v5, 50
+; CHECK-NEXT:    v_readlane_b32 s63, v5, 51
+; CHECK-NEXT:    v_readlane_b32 s64, v5, 52
+; CHECK-NEXT:    v_readlane_b32 s65, v5, 53
+; CHECK-NEXT:    v_readlane_b32 s66, v5, 54
+; CHECK-NEXT:    v_readlane_b32 s67, v5, 55
+; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s6, s8
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 40
 ; CHECK-NEXT:    s_mov_b32 s7, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 41
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 56
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s7
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 42
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 43
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 44
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 45
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 46
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 47
-; CHECK-NEXT:    v_readlane_b32 s44, v5, 48
-; CHECK-NEXT:    v_readlane_b32 s45, v5, 49
-; CHECK-NEXT:    v_readlane_b32 s46, v5, 50
-; CHECK-NEXT:    v_readlane_b32 s47, v5, 51
-; CHECK-NEXT:    v_readlane_b32 s48, v5, 52
-; CHECK-NEXT:    v_readlane_b32 s49, v5, 53
-; CHECK-NEXT:    v_readlane_b32 s50, v5, 54
-; CHECK-NEXT:    v_readlane_b32 s51, v5, 55
-; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
-; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 56
 ; CHECK-NEXT:    v_readlane_b32 s37, v5, 57
 ; CHECK-NEXT:    v_readlane_b32 s38, v5, 58
 ; CHECK-NEXT:    v_readlane_b32 s39, v5, 59
@@ -305,19 +300,25 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_readlane_b32 s41, v5, 61
 ; CHECK-NEXT:    v_readlane_b32 s42, v5, 62
 ; CHECK-NEXT:    v_readlane_b32 s43, v5, 63
+; CHECK-NEXT:    s_nop 4
+; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[52:59], s[8:11] dmask:0x1
 ; CHECK-NEXT:    ; kill: killed $vgpr2_vgpr3
+; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
 ; CHECK-NEXT:    v_readlane_b32 s44, v4, 0
 ; CHECK-NEXT:    v_readlane_b32 s45, v4, 1
 ; CHECK-NEXT:    v_readlane_b32 s46, v4, 2
 ; CHECK-NEXT:    v_readlane_b32 s47, v4, 3
-; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[36:43], s[8:11] dmask:0x1
 ; CHECK-NEXT:    v_readlane_b32 s48, v4, 4
 ; CHECK-NEXT:    v_readlane_b32 s49, v4, 5
 ; CHECK-NEXT:    v_readlane_b32 s50, v4, 6
 ; CHECK-NEXT:    v_readlane_b32 s51, v4, 7
+; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
 ; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT:    ; kill: killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43
+; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
 ; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v2, v7, v6

diff --git a/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir b/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir
index a5cceb622d3a4e7..dca9ffad7e800c5 100644
--- a/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir
@@ -1,6 +1,8 @@
-# RUN: not llc --crash -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -run-pass=greedy -filetype=null %s
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -run-pass=greedy --stress-regalloc=6 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
-; This test would crash while trying to split a liverange during register allocator.
+# The spills/copies during RA for scalar register block LiveIns should be inserted at the beginning of the block.
+# The COPY inserted in bb.9 during liverange split should precede the SPILL that was inserted earlier in the flow.
 
 ---
 name:            test_kernel
@@ -129,14 +131,297 @@ machineFunctionInfo:
   stackPtrOffsetReg: '$sgpr32'
   sgprForEXECCopy: '$sgpr105'
 body:             |
- bb.0:
+  ; GCN-LABEL: name: test_kernel
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead undef [[DEF1:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE $sgpr1, %stack.15, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.15, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr0
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[COPY]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
+  ; GCN-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
+  ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
+  ; GCN-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_64 = IMPLICIT_DEF
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_1]], %stack.19, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.19, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_2:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
+  ; GCN-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
+  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0
+  ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_1]], %stack.17, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.17, addrspace 5)
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   KILL [[DEF2]]
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef [[V_READFIRSTLANE_B32_2]], 132, 0 :: ("amdgpu-noclobber" load (s128), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S128_SAVE [[S_LOAD_DWORDX4_IMM]], %stack.14, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.14, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef [[V_READFIRSTLANE_B32_2]], 188, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1)
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit undef $scc
+  ; GCN-NEXT:   S_BRANCH %bb.3
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.3:
+  ; GCN-NEXT:   successors: %bb.4(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 -1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.4:
+  ; GCN-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_]], %stack.9, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.9, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM undef [[V_READFIRSTLANE_B32_2]], 120, 0 :: ("amdgpu-noclobber" load (s64), align 16, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[S_LOAD_DWORDX2_IMM]], %stack.18, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.18, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM1:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef [[V_READFIRSTLANE_B32_2]], 352, 0 :: ("amdgpu-noclobber" load (s256), align 16, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM1]], %stack.10, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.10, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %97:sreg_64, 0, 0
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM]], %stack.11, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.11, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM2:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef [[V_READFIRSTLANE_B32_2]], 652, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM2]], %stack.6, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.6, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_MOV_B64_]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_2]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM3:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[V_READFIRSTLANE_B32_2]], 688, 0 :: ("amdgpu-noclobber" load (s256), align 16, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM3]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.4, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit undef $scc
+  ; GCN-NEXT:   S_BRANCH %bb.5
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.5:
+  ; GCN-NEXT:   successors: %bb.6(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 -1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.6:
+  ; GCN-NEXT:   successors: %bb.7(0x40000000), %bb.10(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_3]], %stack.5, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.5, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM undef %123:sgpr_64, 0, 0 :: ("amdgpu-noclobber" load (s32), align 16, addrspace 1)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM4:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %124:sgpr_64, 152, 0 :: ("amdgpu-noclobber" load (s256), align 4, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM4]], %stack.20, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.20, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM5:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %125:sgpr_64, 220, 0 :: ("amdgpu-noclobber" load (s256), align 4, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM5]], %stack.16, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.16, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM6:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %126:sgpr_64, 384, 0 :: ("amdgpu-noclobber" load (s256), align 4, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM6]], %stack.13, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.13, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM undef %127:sgpr_64, 440, 0 :: ("amdgpu-noclobber" load (s512), align 8, addrspace 1)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM undef %128:sgpr_64, 584, 0 :: ("amdgpu-noclobber" load (s512), align 16, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S512_SAVE [[S_LOAD_DWORDX16_IMM1]], %stack.12, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.12, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM7:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[V_READFIRSTLANE_B32_]], 156, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM7]], %stack.8, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.8, align 4, addrspace 5)
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.19, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.19, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM [[SI_SPILL_S64_RESTORE]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM3]], %stack.7, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.7, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GCN-NEXT:   dead [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY1]], 0, 0 :: ("amdgpu-noclobber" load (s32), addrspace 1)
+  ; GCN-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_MOV_B64_1]], 0, 0 :: ("amdgpu-noclobber" load (s32), addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE1:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY2:%[0-9]+]].sub1:sgpr_64 = COPY [[SI_SPILL_S64_RESTORE1]].sub1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.10, implicit undef $scc
+  ; GCN-NEXT:   S_BRANCH %bb.7
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.7:
+  ; GCN-NEXT:   successors: %bb.8(0x40000000), %bb.9(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[COPY2]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_3:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF1]].sub0, implicit $exec
+  ; GCN-NEXT:   dead [[V_READFIRSTLANE_B32_3:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF1]].sub1, implicit $exec
+  ; GCN-NEXT:   [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GCN-NEXT:   $vcc = COPY [[DEF3]]
+  ; GCN-NEXT:   S_CBRANCH_VCCNZ %bb.9, implicit $vcc
+  ; GCN-NEXT:   S_BRANCH %bb.8
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.8:
+  ; GCN-NEXT:   successors: %bb.9(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 -1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.9:
+  ; GCN-NEXT:   successors: %bb.10(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = COPY [[S_MOV_B32_4]]
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.10:
+  ; GCN-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORD_IMM2]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.17, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.17, addrspace 5)
+  ; GCN-NEXT:   dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_]], [[SI_SPILL_S32_RESTORE]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.15, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.15, addrspace 5)
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE1]], 0, implicit-def $scc
+  ; GCN-NEXT:   dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE2:%[0-9]+]]:sreg_64_xexec = SI_SPILL_S64_RESTORE %stack.18, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.18, align 4, addrspace 5)
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S64_RESTORE2]].sub1, 0, implicit-def $scc
+  ; GCN-NEXT:   dead [[DEF5:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.20, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.20, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY3:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub0 {
+  ; GCN-NEXT:     internal [[COPY3]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub2
+  ; GCN-NEXT:     internal [[COPY3]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub4
+  ; GCN-NEXT:     internal [[COPY3]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub7
+  ; GCN-NEXT:   }
+  ; GCN-NEXT:   dead [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY3]].sub7, [[S_LOAD_DWORD_IMM5]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY3]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY3]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_3:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY3]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LOAD_DWORDX8_IMM]].sub0, undef [[S_OR_B32_]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_4:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_5:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_6:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_7:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_8:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_9:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.14, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.14, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY4:%[0-9]+]].sub0_sub1_sub2:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0_sub1_sub2
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_10:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_11:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_12:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF5]], [[DEF6]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[DEF9:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF11]], undef [[DEF11]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE1:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.16, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.16, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY5:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub0 {
+  ; GCN-NEXT:     internal [[COPY5]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub2
+  ; GCN-NEXT:     internal [[COPY5]].sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub5
+  ; GCN-NEXT:     internal [[COPY5]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub7
+  ; GCN-NEXT:   }
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_13:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_AND_B32_3:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_8]], undef [[V_CMP_GT_F32_e64_9]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_14:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY5]].sub5, [[COPY5]].sub7, implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE2:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.10, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.10, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY6:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16
+  ; GCN-NEXT:   dead [[S_OR_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY6]].sub0, [[COPY6]].sub1, implicit-def dead $scc
+  ; GCN-NEXT:   dead [[S_OR_B32_4:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY6]].sub2, undef [[S_OR_B32_3]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE2:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.9, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.9, addrspace 5)
+  ; GCN-NEXT:   dead [[S_AND_B32_4:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_3]], [[SI_SPILL_S32_RESTORE2]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_15:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_16:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_17:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_18:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE3:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.11, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.11, addrspace 5)
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_19:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE3]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE3:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.13, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.13, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY7:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub0 {
+  ; GCN-NEXT:     internal [[COPY7]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub2
+  ; GCN-NEXT:     internal [[COPY7]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub4
+  ; GCN-NEXT:     internal [[COPY7]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub7
+  ; GCN-NEXT:   }
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_20:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_21:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[DEF12:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_22:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_AND_B32_5:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF12]], undef [[V_CMP_GT_F32_e64_20]], implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[COPY7]].sub7, 0, implicit-def $scc
+  ; GCN-NEXT:   undef [[COPY8:%[0-9]+]].sub0:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub0 {
+  ; GCN-NEXT:     internal [[COPY8]].sub2:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub2
+  ; GCN-NEXT:     internal [[COPY8]].sub4:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub4
+  ; GCN-NEXT:     internal [[COPY8]].sub6:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub6
+  ; GCN-NEXT:     internal [[COPY8]].sub9:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub9
+  ; GCN-NEXT:     internal [[COPY8]].sub10:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub10
+  ; GCN-NEXT:     internal [[COPY8]].sub13:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub13
+  ; GCN-NEXT:     internal [[COPY8]].sub14:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub14
+  ; GCN-NEXT:   }
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_23:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_24:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_25:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_26:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_AND_B32_6:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_23]], undef [[V_CMP_GT_F32_e64_23]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[S_OR_B32_5:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY8]].sub10, [[COPY8]].sub9, implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_27:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub13, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_28:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub14, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.12, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.12, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY9:%[0-9]+]].sub1:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub1 {
+  ; GCN-NEXT:     internal [[COPY9]].sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub5
+  ; GCN-NEXT:     internal [[COPY9]].sub6:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub6
+  ; GCN-NEXT:     internal [[COPY9]].sub9:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub9
+  ; GCN-NEXT:     internal [[COPY9]].sub10:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10
+  ; GCN-NEXT:     internal [[COPY9]].sub12:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub12
+  ; GCN-NEXT:     internal [[COPY9]].sub15:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub15
+  ; GCN-NEXT:   }
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[COPY9]].sub1, 0, implicit-def $scc
+  ; GCN-NEXT:   dead [[DEF13:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_29:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_30:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[DEF14:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_31:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub9, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_32:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub10, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[DEF15:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[S_AND_B32_7:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF15]], undef [[DEF14]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_33:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub12, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE4:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.6, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.6, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY10:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE4]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16
+  ; GCN-NEXT:   dead [[S_OR_B32_6:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY10]].sub0, [[COPY9]].sub15, implicit-def dead $scc
+  ; GCN-NEXT:   dead [[DEF16:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_34:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_35:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[DEF17:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_36:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_37:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[DEF18:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_38:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_39:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_AND_B32_8:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF18]], undef [[DEF17]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE5:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.4, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY11:%[0-9]+]].sub0_sub1_sub2_sub3_sub4_sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE5]].sub0_sub1_sub2_sub3_sub4_sub5
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_40:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_41:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE4:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_42:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE4]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_43:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_44:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_OR_B32_7:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY11]].sub4, [[COPY11]].sub5, implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE4]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE5:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.5, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.5, addrspace 5)
+  ; GCN-NEXT:   dead [[S_AND_B32_9:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_7]], [[SI_SPILL_S32_RESTORE5]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[S_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY2]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE6:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.8, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.8, align 4, addrspace 5)
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S256_RESTORE6]].sub7, 0, implicit-def $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE6:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.7, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.7, addrspace 5)
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_45:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE6]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF19:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   dead [[S_AND_B32_10:%[0-9]+]]:sreg_32 = S_AND_B32 [[DEF19]], undef [[S_LOAD_DWORD_IMM6]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[S_AND_B32_11:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_AND_B32_10]], [[S_MOV_B32_2]], implicit-def dead $scc
+  ; GCN-NEXT:   $vcc = COPY undef [[S_AND_B32_11]]
+  ; GCN-NEXT:   S_CBRANCH_VCCNZ %bb.12, implicit $vcc
+  ; GCN-NEXT:   S_BRANCH %bb.11
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.11:
+  ; GCN-NEXT:   successors: %bb.12(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.12:
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE3:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE3]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE4:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE4]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+  ; GCN-NEXT:   S_ENDPGM 0
+  bb.0:
     successors: %bb.1, %bb.2
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13
 
     %0:vgpr_32 = IMPLICIT_DEF
     undef %1.sub1:vreg_64 = IMPLICIT_DEF
-    %109:sgpr_32 = COPY undef $sgpr1
-    undef %93.sub1:sgpr_64 = COPY undef $sgpr0
+    %109:sgpr_32 = COPY $sgpr1
+    undef %93.sub1:sgpr_64 = COPY $sgpr0
     undef %106.sub0:sgpr_64 = V_READFIRSTLANE_B32 undef %0, implicit $exec
     %106.sub1:sgpr_64 = V_READFIRSTLANE_B32 undef %0, implicit $exec
     undef %105.sub0:sgpr_64 = V_READFIRSTLANE_B32 undef %0, implicit $exec


        

