[llvm] r338235 - AMDGPU: Force skip over s_sendmsg and exp instructions

Nicolai Haehnle via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 30 02:23:59 PDT 2018


Author: nha
Date: Mon Jul 30 02:23:59 2018
New Revision: 338235

URL: http://llvm.org/viewvc/llvm-project?rev=338235&view=rev
Log:
AMDGPU: Force skip over s_sendmsg and exp instructions

Summary:
These instructions interact with hardware blocks outside the shader core,
and they can have "scalar" side effects even when EXEC = 0. We don't
want these scalar side effects to occur when all lanes want to skip
these instructions, so always add the execz skip branch instruction
for basic blocks that contain them.

Also ensure that we skip scalar stores / atomics, though we don't
code-gen those yet.
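
For concreteness, the pattern this protects looks like the if_sendmsg
test added below (its CHECK lines expect an s_cbranch_execz followed by
the s_sendmsg). A minimal sketch:

  define amdgpu_gs void @if_sendmsg(i32 %flag) {
    %cc = icmp eq i32 %flag, 0
    br i1 %cc, label %sendmsg, label %end

  sendmsg:
    ; The message reaches hardware outside the shader core even when
    ; EXEC = 0, so this block must be branched over when all lanes
    ; are inactive.
    call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
    br label %end

  end:
    ret void
  }

  declare void @llvm.amdgcn.s.sendmsg(i32, i32)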

Reviewers: arsenm, rampitec

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D48431

Change-Id: Ieaeb58352e2789ffd64745603c14970c60819d44

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
    llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp?rev=338235&r1=338234&r2=338235&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp Mon Jul 30 02:23:59 2018
@@ -133,28 +133,10 @@ bool SIInsertSkips::shouldSkip(const Mac
           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
         return true;
 
-      // V_READFIRSTLANE/V_READLANE destination register may be used as operand
-      // by some SALU instruction. If exec mask is zero vector instruction
-      // defining the register that is used by the scalar one is not executed
-      // and scalar instruction will operate on undefined data. For
-      // V_READFIRSTLANE/V_READLANE we should avoid predicated execution.
-      if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
-          (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
         return true;
-      }
-
-      if (I->isInlineAsm()) {
-        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
-        const char *AsmStr = I->getOperand(0).getSymbolName();
-
-        // inlineasm length estimate is number of bytes assuming the longest
-        // instruction.
-        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
-        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
-      } else {
-        ++NumInstr;
-      }
 
+      ++NumInstr;
       if (NumInstr >= SkipThreshold)
         return true;
     }
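
Note in passing that this changes the handling of inline assembly: it
previously only contributed a size estimate to NumInstr, but through
the new hook it now forces the skip branch outright (the hook
conservatively returns true for it). A hypothetical example of an
affected block, not taken from this commit's tests:

  define amdgpu_ps void @if_asm(i32 %flag) {
    %cc = icmp eq i32 %flag, 0
    br i1 %cc, label %doasm, label %end

  doasm:
    ; The asm body is opaque to the compiler, so it is conservatively
    ; assumed to have effects when EXEC = 0 and always gets the execz
    ; skip branch.
    call void asm sideeffect "s_nop 0", ""()
    br label %end

  end:
    ret void
  }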

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=338235&r1=338234&r2=338235&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Mon Jul 30 02:23:59 2018
@@ -2332,6 +2332,36 @@ bool SIInstrInfo::isSchedulingBoundary(c
          changesVGPRIndexingMode(MI);
 }
 
+bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+
+  if (MI.mayStore() && isSMRD(MI))
+    return true; // scalar store or atomic
+
+  // These instructions cause shader I/O that may cause hardware lockups
+  // when executed with an empty EXEC mask.
+  //
+  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
+  //       EXEC = 0, but checking for that case here seems not worth it
+  //       given the typical code patterns.
+  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
+      Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
+    return true;
+
+  if (MI.isInlineAsm())
+    return true; // conservative assumption
+
+  // These are like SALU instructions in terms of effects, so it's questionable
+  // whether we should return true for those.
+  //
+  // However, executing them with EXEC = 0 causes them to operate on undefined
+  // data, which we avoid by returning true here.
+  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
+    return true;
+
+  return false;
+}
+
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   switch (Imm.getBitWidth()) {
   case 32:
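
To illustrate the readfirstlane/readlane case from the comment above:
in a sketch like the following (hypothetical, not from this commit's
tests), the VALU definition of %v executes no lanes when EXEC = 0, so
letting the v_readfirstlane run anyway would hand undefined data to
its uniform uses; forcing the skip branch avoids that.

  declare i32 @llvm.amdgcn.readfirstlane(i32)

  define amdgpu_ps float @if_readfirstlane(i32 %flag, float %x) {
  entry:
    %cc = icmp eq i32 %flag, 0
    br i1 %cc, label %use, label %end

  use:
    %v = fadd float %x, 1.0       ; VALU def; inactive when EXEC = 0
    %vi = bitcast float %v to i32
    %s = call i32 @llvm.amdgcn.readfirstlane(i32 %vi) ; SGPR result
    %f = bitcast i32 %s to float
    br label %end

  end:
    %r = phi float [ %f, %use ], [ 0.0, %entry ]
    ret float %r
  }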

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=338235&r1=338234&r2=338235&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Mon Jul 30 02:23:59 2018
@@ -597,6 +597,9 @@ public:
     return !RI.isSGPRReg(MRI, Dest);
   }
 
+  /// Whether we must prevent this instruction from executing with EXEC = 0.
+  bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
+
   bool isInlineConstant(const APInt &Imm) const;
 
   bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll?rev=338235&r1=338234&r2=338235&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll Mon Jul 30 02:23:59 2018
@@ -480,5 +480,65 @@ define amdgpu_kernel void @test_export_v
   ret void
 }
 
+; GCN-LABEL: {{^}}test_if_export_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
+  br label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_if_export_vm_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_vm_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 true)
+  br label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_if_export_done_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  br label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_if_export_vm_done_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_vm_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+  br label %end
+
+end:
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind inaccessiblememonly }

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll?rev=338235&r1=338234&r2=338235&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll Mon Jul 30 02:23:59 2018
@@ -136,6 +136,21 @@ body:
   ret void
 }
 
+; GCN-LABEL: {{^}}if_sendmsg:
+; GCN: s_cbranch_execz
+; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define amdgpu_gs void @if_sendmsg(i32 %flag) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %sendmsg, label %end
+
+sendmsg:
+  call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
+  br label %end
+
+end:
+  ret void
+}
+
 declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
 declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll?rev=338235&r1=338234&r2=338235&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll Mon Jul 30 02:23:59 2018
@@ -72,10 +72,18 @@ define amdgpu_ps void @test_kill_depth_v
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cbranch_execnz BB6_2
 ; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: exp
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB6_2:
 ; CHECK: v_mov_b32_e64 v7, -1
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB6_4
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: exp
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB6_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
   call void @llvm.AMDGPU.kill(float %x)



