[PATCH] D39171: AMDGPU: Handle s_buffer_load_dword hazard on SI

Marek Olšák via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 22 14:13:06 PDT 2017


mareko created this revision.
Herald added subscribers: t-tye, tpr, dstuttard, yaxunl, wdng, kzhuravl.

https://reviews.llvm.org/D39171

Files:
  lib/Target/AMDGPU/GCNHazardRecognizer.cpp
  test/CodeGen/AMDGPU/smrd.ll


Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -85,6 +85,23 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}smrd_hazard:
+; GCN-DAG: s_mov_b32 s3, 3
+; GCN-DAG: s_mov_b32 s2, 2
+; GCN-DAG: s_mov_b32 s1, 1
+; GCN-DAG: s_mov_b32 s0, 0
+; SI-NEXT: nop 3
+; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
+define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 {
+main_body:
+  %d0 = insertelement <4 x i32> undef, i32 0, i32 0
+  %d1 = insertelement <4 x i32> %d0, i32 1, i32 1
+  %d2 = insertelement <4 x i32> %d1, i32 2, i32 2
+  %d3 = insertelement <4 x i32> %d2, i32 3, i32 3
+  %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %d3, i32 0)
+  ret float %r
+}
+
 ; SMRD load using the load.const.v4i32 intrinsic with an immediate offset
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -343,6 +343,36 @@
         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
+
+  // This fixes what appears to be undocumented hardware behavior in SI where
+  // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
+  // needs some number of nops in between. We don't know how many we need, but
+  // let's use 4. This wasn't discovered before probably because the only
+  // case when this happens is when we expand a 64-bit pointer into a full
+  // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
+  // probably never encountered in the closed-source land.
+  if (SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORD_SGPR ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR ||
+      SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR) {
+     int SmrdSgprWaitStates = 4;
+     auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
+
+     for (const MachineOperand &Use : SMRD->uses()) {
+        if (!Use.isReg())
+           continue;
+        int WaitStatesNeededForUse =
+              SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+     }
+  }
+
   return WaitStatesNeeded;
 }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D39171.119799.patch
Type: text/x-patch
Size: 3034 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20171022/eaf65b1c/attachment.bin>


More information about the llvm-commits mailing list