[PATCH] MISched: Fix moving stores across barriers

Tom Stellard thomas.stellard at amd.com
Tue Dec 2 12:32:54 PST 2014


This fixes an issue with ScheduleDAGInstrs::buildSchedGraph
where stores without an underlying object would not be added
as a predecessor to the current BarrierChain. The
BarrierChain check is moved to the top of the mayStore()
branch so that it runs before the early-exit paths taken
when no underlying object is found for the store.
---
 lib/CodeGen/ScheduleDAGInstrs.cpp  |  9 +++++--
 test/CodeGen/R600/store-barrier.ll | 52 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/R600/store-barrier.ll

diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index d8d8422..ee8b5c2 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -794,6 +794,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
        MII != MIE; --MII) {
     MachineInstr *MI = std::prev(MII);
+
     if (MI && DbgMI) {
       DbgValues.push_back(std::make_pair(DbgMI, MI));
       DbgMI = nullptr;
@@ -920,6 +921,12 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       AliasMemDefs.clear();
       AliasMemUses.clear();
     } else if (MI->mayStore()) {
+      // Add dependence on barrier chain, if needed.
+      // There is no point to check aliasing on barrier event. Even if
+      // SU and barrier _could_ be reordered, they should not. In addition,
+      // we have lost all RejectMemNodes below barrier.
+      if (BarrierChain)
+        BarrierChain->addPred(SDep(SU, SDep::Barrier));
       UnderlyingObjectsVector Objs;
       getUnderlyingObjectsForInstr(MI, MFI, Objs);
 
@@ -993,8 +1000,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       // There is no point to check aliasing on barrier event. Even if
       // SU and barrier _could_ be reordered, they should not. In addition,
       // we have lost all RejectMemNodes below barrier.
-      if (BarrierChain)
-        BarrierChain->addPred(SDep(SU, SDep::Barrier));
     } else if (MI->mayLoad()) {
       bool MayAlias = true;
       if (MI->isInvariantLoad(AA)) {
diff --git a/test/CodeGen/R600/store-barrier.ll b/test/CodeGen/R600/store-barrier.ll
new file mode 100644
index 0000000..229cd8f
--- /dev/null
+++ b/test/CodeGen/R600/store-barrier.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
+
+; This test is for a bug in the machine scheduler where stores without
+; an underlying object would be moved across the barrier.  In this
+; test, the <2 x i8> store will be split into two i8 stores, so they
+; won't have an underlying object.
+
+; CHECK-LABEL: {{^}}test:
+; CHECK: ds_write_b8
+; CHECK: ds_write_b8
+; CHECK: s_barrier
+; Function Attrs: nounwind
+define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
+bb:
+  %tmp10 = getelementptr inbounds i32 addrspace(1)* %arg2, i64 %tmp9
+  %tmp13 = load i32 addrspace(1)* %tmp10, align 2
+  %tmp14 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp13
+  %tmp15 = load <2 x i8> addrspace(3)* %tmp14, align 2
+  %tmp16 = add i32 %tmp13, 1
+  %tmp17 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp16
+  store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2
+  tail call void @llvm.AMDGPU.barrier.local() #2
+  %tmp25 = load i32 addrspace(1)* %tmp10, align 4
+  %tmp26 = sext i32 %tmp25 to i64
+  %tmp27 = sext i32 %arg4 to i64
+  %tmp28 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4
+  %tmp29 = load i8 addrspace(3)* %tmp28, align 1
+  %tmp30 = getelementptr inbounds <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27
+  store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1
+  %tmp32 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0
+  %tmp33 = load i8 addrspace(3)* %tmp32, align 1
+  %tmp35 = getelementptr inbounds <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0
+  store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1
+  ret void
+}
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<2 x i8> addrspace(3)*, <2 x i8> addrspace(1)*, i32 addrspace(1)*, <2 x i8> addrspace(1)*, i32, i64)* @test}
+!3 = metadata !{metadata !4, metadata !4, i64 0}
+!4 = metadata !{metadata !"int", metadata !5, i64 0}
+!5 = metadata !{metadata !"omnipotent char", metadata !6, i64 0}
+!6 = metadata !{metadata !"Simple C/C++ TBAA"}
+!7 = metadata !{metadata !5, metadata !5, i64 0}
-- 
2.0.4




More information about the llvm-commits mailing list