[llvm] 890c76a - [AMDGPU] Fix odd implicit operand handling in clause breaking

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri May 19 13:24:39 PDT 2023


Author: Jay Foad
Date: 2023-05-19T21:24:33+01:00
New Revision: 890c76a9314a0b2fca7341b9307955ed83dccfc0

URL: https://github.com/llvm/llvm-project/commit/890c76a9314a0b2fca7341b9307955ed83dccfc0
DIFF: https://github.com/llvm/llvm-project/commit/890c76a9314a0b2fca7341b9307955ed83dccfc0.diff

LOG: [AMDGPU] Fix odd implicit operand handling in clause breaking

By inspection. Because of the strange behaviour of MI.uses(), this was
adding implicit defs to the clause *uses* set, and then wrongly
detecting a conflict between explicit defs and implicit defs.

For example it would detect a conflict on this pair of instructions:

   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4088, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1, addrspace 5)
   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4092, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1 + 4, addrspace 5)

Differential Revision: https://reviews.llvm.org/D150947

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
    llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 53196c39f296..e03c5b5a59ed 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -594,17 +594,15 @@ static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
 
 static void addRegsToSet(const SIRegisterInfo &TRI,
                          iterator_range<MachineInstr::const_mop_iterator> Ops,
-                         BitVector &Set) {
+                         BitVector &DefSet, BitVector &UseSet) {
   for (const MachineOperand &Op : Ops) {
     if (Op.isReg())
-      addRegUnits(TRI, Set, Op.getReg().asMCReg());
+      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
   }
 }
 
 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
-  // XXX: Do we need to worry about implicit operands
-  addRegsToSet(TRI, MI.defs(), ClauseDefs);
-  addRegsToSet(TRI, MI.uses(), ClauseUses);
+  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
 }
 
 static bool breaksSMEMSoftClause(MachineInstr *MI) {

diff  --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 0a1c7eb5e5a6..e8ceeece372d 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -206,7 +206,6 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a1
 ; GFX90A-NEXT:    buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -1110,7 +1109,6 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a1
 ; GFX90A-NEXT:    buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 3df5344ae908..2cbd3e0b382f 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -171,10 +171,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    v_writelane_b32 v5, s5, 2
 ; W64-O0-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
@@ -546,10 +544,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    v_writelane_b32 v13, s5, 2
 ; W64-O0-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
@@ -599,10 +595,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    v_writelane_b32 v13, s5, 10
 ; W64-O0-NEXT:  .LBB1_4: ; =>This Inner Loop Header: Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
@@ -645,15 +639,10 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB1_4
 ; W64-O0-NEXT:  ; %bb.6:
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    v_readlane_b32 s4, v13, 9
 ; W64-O0-NEXT:    v_readlane_b32 s5, v13, 10
@@ -1040,10 +1029,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_writelane_b32 v8, s5, 3
 ; W64-O0-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
@@ -1114,10 +1101,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_writelane_b32 v8, s5, 14
 ; W64-O0-NEXT:  .LBB2_5: ; =>This Inner Loop Header: Depth=1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
@@ -1169,7 +1154,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    v_readlane_b32 s4, v8, 10
 ; W64-O0-NEXT:    v_readlane_b32 s5, v8, 11

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 67ed07878b7f..e9d39b3fe445 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -290,7 +290,6 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
@@ -357,7 +356,6 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
@@ -523,7 +521,6 @@ define void @test_sgpr_offset_subregs_function() {
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
@@ -590,7 +587,6 @@ define void @test_inst_offset_subregs_function() {
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x3ff00
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
-; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 8a951ca9f62f..6a1a3afce1f5 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -178,7 +178,6 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O0-NEXT:  ; %bb.1: ; %if
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
@@ -598,23 +597,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
@@ -671,16 +661,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
@@ -774,7 +759,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -829,15 +813,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
@@ -1195,23 +1174,14 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
@@ -1318,23 +1288,14 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v25, s29
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)


        


More information about the llvm-commits mailing list