[llvm] ca8b20c - [AMDGPU] need to insert wait between the scalar load and vector store to the same address to avoid WAR conflict.
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 4 07:25:55 PST 2020
Author: alex-t
Date: 2020-01-04T18:23:14+03:00
New Revision: ca8b20ca3ba10288b61a083c4ce57fb011124935
URL: https://github.com/llvm/llvm-project/commit/ca8b20ca3ba10288b61a083c4ce57fb011124935
DIFF: https://github.com/llvm/llvm-project/commit/ca8b20ca3ba10288b61a083c4ce57fb011124935.diff
LOG: [AMDGPU] need to insert wait between the scalar load and vector store to the same address to avoid WAR conflict.
Reviewers: rampitec, vpykhtin, nhaehnle
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D71934
Added:
llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 927826c52404..ef662d55cb0a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -42,7 +42,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -372,6 +374,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
AMDGPU::IsaVersion IV;
DenseSet<MachineInstr *> TrackedWaitcntSet;
+ DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+ MachinePostDominatorTree *PDT;
struct BlockInfo {
MachineBasicBlock *MBB;
@@ -406,6 +410,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -792,6 +797,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
@@ -1012,6 +1018,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (MI.mayStore()) {
// FIXME: Should not be relying on memoperands.
for (const MachineMemOperand *Memop : MI.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ if (SLoadAddresses.count(Ptr)) {
+ addWait(Wait, LGKM_CNT, 0);
+ if (PDT->dominates(MI.getParent(),
+ SLoadAddresses.find(Ptr)->second))
+ SLoadAddresses.erase(Ptr);
+ }
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
@@ -1399,6 +1412,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
}
+ if (TII->isSMRD(Inst)) {
+ for (const MachineMemOperand *Memop : Inst.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+ }
+ }
+
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1448,6 +1468,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
IV = AMDGPU::getIsaVersion(ST->getCPU());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
for (auto T : inst_counter_types())
diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
new file mode 100644
index 000000000000..4ba16b4eb30b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+
+; GCN-LABEL: BB0_1
+; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+
+define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) {
+bb:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp2 = icmp eq i32 %tmp, 0
+ br i1 %tmp2, label %bb3, label %bb8
+
+bb3: ; preds = %bb
+ %tmp4 = load i32, i32 addrspace(1)* %arg, align 4
+ store i32 0, i32 addrspace(1)* %arg, align 4
+ %tmp5 = zext i32 %tmp4 to i64
+ %tmp6 = load i64, i64 addrspace(1)* %arg1, align 8
+ %tmp7 = add i64 %tmp6, %tmp5
+ store i64 %tmp7, i64 addrspace(1)* %arg1, align 8
+ br label %bb8
+
+bb8: ; preds = %bb3, %bb
+ ret void
+}
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }
More information about the llvm-commits mailing list