[llvm] r285273 - AMDGPU: Fix SILoadStoreOptimizer when writes cannot be merged due to register dependencies

Thu Oct 27 01:15:07 PDT 2016

Author: nha
Date: Thu Oct 27 03:15:07 2016
New Revision: 285273

URL: http://llvm.org/viewvc/llvm-project?rev=285273&view=rev
Log:
AMDGPU: Fix SILoadStoreOptimizer when writes cannot be merged due to register dependencies

Summary:
When finding a match for a merge and collecting the instructions that must
be moved, keep in mind that the instruction we merge might actually use one
of the defs that are being moved.

Fixes piglit spec/arb_enhanced_layouts/execution/component-layout/vs-tcs-load-output[-indirect].

The fact that the ds_read in the test case is not eliminated suggests that
there might be another problem related to alias analysis, but that's a
separate problem: this pass should still work correctly even when earlier
optimization passes missed something or were disabled.

Reviewers: tstellarAMD, arsenm

Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye

Differential Revision: https://reviews.llvm.org/D25829

Added:
    llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp?rev=285273&r1=285272&r2=285273&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Thu Oct 27 03:15:07 2016
@@ -141,6 +141,27 @@ static void addDefsToList(const MachineI
   }
 }
 
+// If MI reads a virtual register defined by any operand in Defs, append MI to
+// Insts and MI's own defs to Defs, and return true; otherwise return false.
+static bool
+addToListsIfDependent(MachineInstr &MI,
+                      SmallVectorImpl<const MachineOperand *> &Defs,
+                      SmallVectorImpl<MachineInstr*> &Insts) {
+  for (const MachineOperand *Def : Defs) {
+    bool ReadDef = MI.readsVirtualRegister(Def->getReg());
+    // If ReadDef is true, MI uses a register from Defs, i.e. (for the caller)
+    // a def that is going to be moved below the merged instructions. MI must
+    // then be moved as well, and its own defs tracked for later users.
+    if (ReadDef) {
+      Insts.push_back(&MI);
+      addDefsToList(MI, Defs);
+      return true; // MI is recorded once; Defs was just extended, stop here.
+    }
+  }
+
+  return false;
+}
+
 static bool
 canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                         ArrayRef<MachineInstr*> InstsToMove,
@@ -224,17 +245,7 @@ SILoadStoreOptimizer::findMatchingDSInst
       // When we match I with another DS instruction we will be moving I down
       // to the location of the matched instruction any uses of I will need to
       // be moved down as well.
-      for (const MachineOperand *Def : DefsToMove) {
-        bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
-        // If ReadDef is true, then there is a use of Def between I
-        // and the instruction that I will potentially be merged with. We
-        // will need to move this instruction after the merged instructions.
-        if (ReadDef) {
-          InstsToMove.push_back(&*MBBI);
-          addDefsToList(*MBBI, DefsToMove);
-          break;
-        }
-      }
+      addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
       continue;
     }
 
@@ -242,6 +253,15 @@ SILoadStoreOptimizer::findMatchingDSInst
     if (MBBI->hasOrderedMemoryRef())
       return E;
 
+    // Handle a case like
+    //   DS_WRITE_B32 addr, v, idx0
+    //   w = DS_READ_B32 addr, idx0
+    //   DS_WRITE_B32 addr, f(w), idx1
+    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
+    // merging of the two writes.
+    if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+      continue;
+
     int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
     const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
     const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

Added: llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll?rev=285273&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll Thu Oct 27 03:15:07 2016
@@ -0,0 +1,23 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: ds_write_b32
+; CHECK: ds_read_b32
+; CHECK: ds_write_b32
+define amdgpu_vs void @test1(i32 %v) #0 {
+  %p0 = getelementptr i32, i32 addrspace(3)* null, i32 0
+  %p1 = getelementptr i32, i32 addrspace(3)* null, i32 1
+
+  store i32 %v, i32 addrspace(3)* %p0
+
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+
+  %w = load i32, i32 addrspace(3)* %p0
+  store i32 %w, i32 addrspace(3)* %p1
+  ret void
+}
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind }