[llvm] ca8b20c - [AMDGPU] Insert a wait between a scalar load and a vector store to the same address to avoid a WAR conflict.

Sat Jan 4 07:25:55 PST 2020

Author: alex-t
Date: 2020-01-04T18:23:14+03:00
New Revision: ca8b20ca3ba10288b61a083c4ce57fb011124935

URL: https://github.com/llvm/llvm-project/commit/ca8b20ca3ba10288b61a083c4ce57fb011124935
DIFF: https://github.com/llvm/llvm-project/commit/ca8b20ca3ba10288b61a083c4ce57fb011124935.diff

LOG: [AMDGPU] Insert a wait between a scalar load and a vector store to the same address to avoid a WAR (write-after-read) conflict.

Reviewers: rampitec, vpykhtin, nhaehnle

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D71934

Added: 
    llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 927826c52404..ef662d55cb0a 100644

--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -42,7 +42,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
@@ -372,6 +374,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   AMDGPU::IsaVersion IV;
 
   DenseSet<MachineInstr *> TrackedWaitcntSet;
+  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  MachinePostDominatorTree *PDT;
 
   struct BlockInfo {
     MachineBasicBlock *MBB;
@@ -406,6 +410,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -792,6 +797,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
 
@@ -1012,6 +1018,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
       if (MI.mayStore()) {
         // FIXME: Should not be relying on memoperands.
         for (const MachineMemOperand *Memop : MI.memoperands()) {
+          const Value *Ptr = Memop->getValue();
+          if (SLoadAddresses.count(Ptr)) {
+            addWait(Wait, LGKM_CNT, 0);
+            if (PDT->dominates(MI.getParent(),
+                               SLoadAddresses.find(Ptr)->second))
+              SLoadAddresses.erase(Ptr);
+          }
           unsigned AS = Memop->getAddrSpace();
           if (AS != AMDGPUAS::LOCAL_ADDRESS)
             continue;
@@ -1399,6 +1412,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       }
     }
 
+    if (TII->isSMRD(Inst)) {
+      for (const MachineMemOperand *Memop : Inst.memoperands()) {
+        const Value *Ptr = Memop->getValue();
+        SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+      }
+    }
+
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1448,6 +1468,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())

diff  --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
new file mode 100644
index 000000000000..4ba16b4eb30b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -0,0 +1,29 @@
+; RUN: llc  -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+
+; GCN-LABEL: BB0_1
+; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+
+define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = icmp eq i32 %tmp, 0
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3:                                              ; preds = %bb
+  %tmp4 = load i32, i32 addrspace(1)* %arg, align 4
+  store i32 0, i32 addrspace(1)* %arg, align 4
+  %tmp5 = zext i32 %tmp4 to i64
+  %tmp6 = load i64, i64 addrspace(1)* %arg1, align 8
+  %tmp7 = add i64 %tmp6, %tmp5
+  store i64 %tmp7, i64 addrspace(1)* %arg1, align 8
+  br label %bb8
+
+bb8:                                              ; preds = %bb3, %bb
+  ret void
+}
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }