[llvm] r363465 - AMDGPU: Avoid most waitcnts before calls

Matt Arsenault via llvm-commits llvm-commits@lists.llvm.org
Fri Jun 14 14:52:27 PDT 2019


Author: arsenm
Date: Fri Jun 14 14:52:26 2019
New Revision: 363465

URL: http://llvm.org/viewvc/llvm-project?rev=363465&view=rev
Log:
AMDGPU: Avoid most waitcnts before calls

Currently you get extra waits because waits are inserted for the
register dependencies of the call, while the function prolog already
waits on everything.

Waits are still inserted on returns for now. It may make sense not to
do this and instead wait in the caller.
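
As a rough illustration (not part of the patch), consider a kernel like the
call_memory_arg_load test updated below, where the call argument comes from an
LDS load:

  ; illustrative example only, modeled on the call_memory_arg_load test
  declare void @func(i32)

  define amdgpu_kernel void @example(i32 addrspace(3)* %ptr) {
    %val = load i32, i32 addrspace(3)* %ptr
    call void @func(i32 %val)
    ret void
  }

Before this change an s_waitcnt lgkmcnt(0) was emitted before the
s_swappc_b64 for the register dependency on %val; since the callee's prolog
already waits on everything, that wait is now dropped. Only the call address
itself (e.g. a GOT load) still needs a wait before the call.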

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/trunk/test/CodeGen/AMDGPU/call-waitcnt.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=363465&r1=363464&r2=363465&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Fri Jun 14 14:52:26 2019
@@ -808,6 +808,21 @@ static bool readsVCCZ(const MachineInstr
          !MI.getOperand(1).isUndef();
 }
 
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+  // Currently all conventions wait, but this may not always be the case.
+  //
+  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+  // sense to omit the wait and do it in the caller.
+  return true;
+}
+
+/// \returns true if the callee is expected to wait on any outstanding counters
+/// before returning.
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
+  return true;
+}
+
 ///  Generate s_waitcnt instruction to be placed before cur_Inst.
 ///  Instructions of a given type are returned in order,
 ///  but instructions of different types can complete out of order.
@@ -843,7 +858,8 @@ bool SIInsertWaitcnts::generateWaitcntIn
   // NOTE: this could be improved with knowledge of all call sites or
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
-      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
+      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
   }
   // Resolve vm waits before gs-done.
@@ -923,91 +939,91 @@ bool SIInsertWaitcnts::generateWaitcntIn
       }
     }
 
-#if 0 // TODO: the following code to handle CALL.
-    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
-    // However, there is a problem with EXP_CNT, because the call cannot
-    // easily tell if a register is used in the function, and if it did, then
-    // the referring instruction would have to have an S_WAITCNT, which is
-    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
-    // before the call.
-    if (MI.getOpcode() == SC_CALL) {
-      if (ScoreBrackets->getScoreUB(EXP_CNT) >
-        ScoreBrackets->getScoreLB(EXP_CNT)) {
-        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-        EmitWaitcnt |= CNT_MASK(EXP_CNT);
-      }
-    }
-#endif
-
-    // FIXME: Should not be relying on memoperands.
-    // Look at the source operands of every instruction to see if
-    // any of them results from a previous memory operation that affects
-    // its current usage. If so, an s_waitcnt instruction needs to be
-    // emitted.
-    // If the source operand was defined by a load, add the s_waitcnt
-    // instruction.
-    for (const MachineMemOperand *Memop : MI.memoperands()) {
-      unsigned AS = Memop->getAddrSpace();
-      if (AS != AMDGPUAS::LOCAL_ADDRESS)
-        continue;
-      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-      // VM_CNT is only relevant to vgpr or LDS.
-      ScoreBrackets.determineWait(
-          VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-    }
-
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-      const MachineOperand &Op = MI.getOperand(I);
-      const MachineRegisterInfo &MRIA = *MRI;
-      RegInterval Interval =
-          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
+      // Don't bother waiting on anything except the call address. The function
+      // is going to insert a wait on everything in its prolog. We still need to
+      // be careful if the call target is a load (e.g. a GOT load).
+      Wait = AMDGPU::Waitcnt();
+
+      int CallAddrOpIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+      RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
+                                                          CallAddrOpIdx, false);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        if (TRI->isVGPR(MRIA, Op.getReg())) {
-          // VM_CNT is only relevant to vgpr or LDS.
-          ScoreBrackets.determineWait(
-              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-        }
         ScoreBrackets.determineWait(
             LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
-    }
-    // End of for loop that looks at all source operands to decide vm_wait_cnt
-    // and lgk_wait_cnt.
-
-    // Two cases are handled for destination operands:
-    // 1) If the destination operand was defined by a load, add the s_waitcnt
-    // instruction to guarantee the right WAW order.
-    // 2) If a destination operand that was used by a recent export/store ins,
-    // add s_waitcnt on exp_cnt to guarantee the WAR order.
-    if (MI.mayStore()) {
+    } else {
       // FIXME: Should not be relying on memoperands.
+      // Look at the source operands of every instruction to see if
+      // any of them results from a previous memory operation that affects
+      // its current usage. If so, an s_waitcnt instruction needs to be
+      // emitted.
+      // If the source operand was defined by a load, add the s_waitcnt
+      // instruction.
       for (const MachineMemOperand *Memop : MI.memoperands()) {
         unsigned AS = Memop->getAddrSpace();
         if (AS != AMDGPUAS::LOCAL_ADDRESS)
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+        // VM_CNT is only relevant to vgpr or LDS.
         ScoreBrackets.determineWait(
             VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-        ScoreBrackets.determineWait(
-            EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
       }
-    }
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-      MachineOperand &Def = MI.getOperand(I);
-      const MachineRegisterInfo &MRIA = *MRI;
-      RegInterval Interval =
-          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
-      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        if (TRI->isVGPR(MRIA, Def.getReg())) {
+
+      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+        const MachineOperand &Op = MI.getOperand(I);
+        const MachineRegisterInfo &MRIA = *MRI;
+        RegInterval Interval =
+            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+          if (TRI->isVGPR(MRIA, Op.getReg())) {
+            // VM_CNT is only relevant to vgpr or LDS.
+            ScoreBrackets.determineWait(
+                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+          }
+          ScoreBrackets.determineWait(
+              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+        }
+      }
+      // End of for loop that looks at all source operands to decide vm_wait_cnt
+      // and lgk_wait_cnt.
+
+      // Two cases are handled for destination operands:
+      // 1) If the destination operand was defined by a load, add the s_waitcnt
+      // instruction to guarantee the right WAW order.
+      // 2) If a destination operand that was used by a recent export/store ins,
+      // add s_waitcnt on exp_cnt to guarantee the WAR order.
+      if (MI.mayStore()) {
+        // FIXME: Should not be relying on memoperands.
+        for (const MachineMemOperand *Memop : MI.memoperands()) {
+          unsigned AS = Memop->getAddrSpace();
+          if (AS != AMDGPUAS::LOCAL_ADDRESS)
+            continue;
+          unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
           ScoreBrackets.determineWait(
               VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
           ScoreBrackets.determineWait(
               EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
         }
-        ScoreBrackets.determineWait(
-            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
-    } // End of for loop that looks at all dest operands.
+      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+        MachineOperand &Def = MI.getOperand(I);
+        const MachineRegisterInfo &MRIA = *MRI;
+        RegInterval Interval =
+            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
+        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+          if (TRI->isVGPR(MRIA, Def.getReg())) {
+            ScoreBrackets.determineWait(
+                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+            ScoreBrackets.determineWait(
+                EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+          }
+          ScoreBrackets.determineWait(
+              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+        }
+      } // End of for loop that looks at all dest operands.
+    }
   }
 
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
@@ -1224,6 +1240,14 @@ void SIInsertWaitcnts::updateEventWaitcn
     }
   } else if (TII->isSMRD(Inst)) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else if (Inst.isCall()) {
+    if (callWaitsOnFunctionReturn(Inst)) {
+      // Act as a wait on everything
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+    } else {
+      // May still need to wait for anything after the call.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+    }
   } else {
     switch (Inst.getOpcode()) {
     case AMDGPU::S_SENDMSG:

Modified: llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll?rev=363465&r1=363464&r2=363465&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll Fri Jun 14 14:52:26 2019
@@ -154,7 +154,7 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s3
 
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
@@ -174,7 +174,7 @@ define amdgpu_kernel void @test_call_ext
 
 ; GCN-DAG: s_mov_b32 s32, s33
 
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
@@ -205,7 +205,7 @@ define amdgpu_kernel void @test_call_ext
 
 ; GCN-DAG: s_mov_b32 s32, s33
 
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
@@ -224,7 +224,7 @@ define amdgpu_kernel void @test_call_ext
 
 ; GCN-DAG: s_mov_b32 s32, s33
 
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
@@ -267,8 +267,8 @@ define amdgpu_kernel void @test_call_ext
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* null
   call void @external_void_func_v2i64(<2 x i64> %val)
@@ -290,8 +290,8 @@ define amdgpu_kernel void @test_call_ext
 ; GCN: buffer_load_dwordx4 v[0:3]
 ; GCN: v_mov_b32_e32 v4, 1
 ; GCN: v_mov_b32_e32 v5, 2
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
@@ -307,8 +307,8 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-DAG: v_mov_b32_e32 v6, 3
 ; GCN-DAG: v_mov_b32_e32 v7, 4
 
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -483,8 +483,8 @@ define amdgpu_kernel void @test_call_ext
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
 ; GCN: buffer_load_dwordx2 v[0:1]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
   call void @external_void_func_v2i32(<2 x i32> %val)
@@ -527,8 +527,8 @@ define amdgpu_kernel void @test_call_ext
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
   call void @external_void_func_v4i32(<4 x i32> %val)
@@ -562,8 +562,8 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
   %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
@@ -591,8 +591,8 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
 ; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
   %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
@@ -609,8 +609,8 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-DAG: buffer_load_dwordx4 v[20:23], off
 ; GCN-DAG: buffer_load_dwordx4 v[24:27], off
 ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
   %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
@@ -647,12 +647,11 @@ define amdgpu_kernel void @test_call_ext
   ret void
 }
 
-; FIXME: No wait after call
 ; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
 ; GCN: v_mov_b32_e32 v0, 42
 ; GCN: s_swappc_b64 s[30:31],
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
+; GCN-NOT: s_waitcnt
+; GCN: buffer_store_dword v0, off, s[36:39], 0
 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
   %val = call i32 @external_i32_func_i32(i32 42)
   store volatile i32 %val, i32 addrspace(1)* %out
@@ -662,8 +661,8 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
 ; GCN: buffer_load_ubyte v0, off
 ; GCN: buffer_load_dword v1, off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0

Modified: llvm/trunk/test/CodeGen/AMDGPU/call-waitcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-waitcnt.ll?rev=363465&r1=363464&r2=363465&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-waitcnt.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-waitcnt.ll Fri Jun 14 14:52:26 2019
@@ -17,7 +17,6 @@ define amdgpu_kernel void @call_memory_a
 ; GCN-NEXT:    s_getpc_b64 s[6:7]
 ; GCN-NEXT:    s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s7, s7, func@rel32@hi+4
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
@@ -67,7 +66,6 @@ define amdgpu_kernel void @call_no_wait_
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_mov_b32_e32 v32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-NEXT:    v_mov_b32_e32 v1, s35
 ; GCN-NEXT:    global_store_dword v[0:1], v32, off
@@ -91,7 +89,6 @@ define amdgpu_kernel void @call_no_wait_
 ; GCN-NEXT:    s_addc_u32 s7, s7, func.return@rel32@hi+4
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NEXT:    v_mov_b32_e32 v2, s35
 ; GCN-NEXT:    global_store_dword v[1:2], v0, off
@@ -138,7 +135,7 @@ define void @tailcall_got_load(i32 addrs
   ret void
 }
 
-; Need to wait for the address dependency
+; No need to wait for the load.
 define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
 ; GCN-LABEL: tail_call_memory_arg_load:
 ; GCN:       ; %bb.0:
@@ -147,7 +144,6 @@ define void @tail_call_memory_arg_load(i
 ; GCN-NEXT:    s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s7, s7, func@rel32@hi+4
 ; GCN-NEXT:    ds_read_b32 v0, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[6:7]
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
   tail call void @func(i32 %vgpr)



