[llvm] r285919 - [AMDGPU][CodeGen] To improve CGEMM performance: combine LDS reads.

Alexander Timofeev via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 3 07:37:13 PDT 2016


Author: alex-t
Date: Thu Nov  3 09:37:13 2016
New Revision: 285919

URL: http://llvm.org/viewvc/llvm-project?rev=285919&view=rev
Log:
[AMDGPU][CodeGen] To improve CGEMM performance: combine LDS reads.

This change exploits the fact that LDS reads may be reordered even if they
access the same location.

Prior to this change, the algorithm stopped as soon as any memory access
was encountered between loads that were expected to be merged together,
even though a Read-After-Read conflict cannot affect execution
correctness.

Improves the performance of hcBLAS CGEMM manually loop-unrolled kernels by 44%.
A similar improvement is expected for any long sequence of reads from LDS.
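
To make the intent concrete, here is a minimal, hypothetical IR sketch (not
part of this commit; the ds_read2.ll test added below is the authoritative
regression test). Reads from two distinct LDS arrays are interleaved, so the
pair of reads from each array is separated by reads from the other array.
Because every intervening access is itself a read, a Read-After-Read
dependence is the only possible conflict, and the optimizer may now keep
scanning past them and pair the adjacent loads into ds_read2 instructions:

; Hypothetical kernel, assuming %A and %B are separate LDS arrays whose
; accesses are not trivially disjoint. All accesses between the candidate
; load pairs are reads, so reordering them is safe and the loads from
; offsets 0 and 1 of each array can be combined.
define amdgpu_kernel void @interleaved_lds_reads(
  float addrspace(1)* %out,
  [4 x float] addrspace(3)* %A,
  [4 x float] addrspace(3)* %B) {
bb:
  %a0.ptr = getelementptr [4 x float], [4 x float] addrspace(3)* %A, i32 0, i32 0
  %b0.ptr = getelementptr [4 x float], [4 x float] addrspace(3)* %B, i32 0, i32 0
  %a1.ptr = getelementptr [4 x float], [4 x float] addrspace(3)* %A, i32 0, i32 1
  %b1.ptr = getelementptr [4 x float], [4 x float] addrspace(3)* %B, i32 0, i32 1
  %a0 = load float, float addrspace(3)* %a0.ptr
  %b0 = load float, float addrspace(3)* %b0.ptr
  %a1 = load float, float addrspace(3)* %a1.ptr
  %b1 = load float, float addrspace(3)* %b1.ptr
  %m0 = fmul float %a0, %b0
  %m1 = fmul float %a1, %b1
  %sum = fadd float %m0, %m1
  store float %sum, float addrspace(1)* %out
  ret void
}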

Differential Revision: https://reviews.llvm.org/D25944

Modified:
    llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
    llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp?rev=285919&r1=285918&r2=285919&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Thu Nov  3 09:37:13 2016
@@ -141,6 +141,18 @@ static void addDefsToList(const MachineI
   }
 }
 
+static bool memAccessesCanBeReordered(
+  MachineBasicBlock::iterator A,
+  MachineBasicBlock::iterator B,
+  const SIInstrInfo *TII,
+  llvm::AliasAnalysis * AA) {
+  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+    // RAW or WAR - cannot reorder
+    // WAW - cannot reorder
+    // RAR - safe to reorder
+    !(A->mayStore() || B->mayStore()));
+}
+
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
 static bool
@@ -173,8 +185,8 @@ canMoveInstsAcrossMemOp(MachineInstr &Me
   for (MachineInstr *InstToMove : InstsToMove) {
     if (!InstToMove->mayLoadOrStore())
       continue;
-    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
-      return false;
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+        return false;
   }
   return true;
 }
@@ -233,7 +245,7 @@ SILoadStoreOptimizer::findMatchingDSInst
         return E;
 
       if (MBBI->mayLoadOrStore() &&
-          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+        !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
         // We fail condition #1, but we may still be able to satisfy condition
         // #2.  Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
@@ -288,8 +300,10 @@ SILoadStoreOptimizer::findMatchingDSInst
     // We could potentially keep looking, but we'd need to make sure that
     // it was safe to move I and also all the instruction in InstsToMove
     // down past this instruction.
-    // FIXME: This is too conservative.
-    break;
+    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||   // check if we can move I across MBBI
+      !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+     )
+      break;
   }
   return E;
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll?rev=285919&r1=285918&r2=285919&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll Thu Nov  3 09:37:13 2016
@@ -493,6 +493,46 @@ define void @misaligned_read2_i64(i64 ad
   ret void
 }
 
+; SI-LABEL: ds_read_diff_base_interleaving
+; SI-NOT: ds_read_b32
+define amdgpu_kernel void @ds_read_diff_base_interleaving(
+  float addrspace(1)* nocapture %arg,
+  [4 x [4 x float]] addrspace(3)* %arg1,
+  [4 x [4 x float]] addrspace(3)* %arg2,
+  [4 x [4 x float]] addrspace(3)* %arg3,
+  [4 x [4 x float]] addrspace(3)* %arg4) #1 {
+bb:
+  %tmp = getelementptr float, float addrspace(1)* %arg, i64 10
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0
+  %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5
+  %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0
+  %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5
+  %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1
+  %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5
+  %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1
+  %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5
+  %tmp15 = load float, float addrspace(3)* %tmp7
+  %tmp16 = load float, float addrspace(3)* %tmp8
+  %tmp17 = fmul float %tmp15, %tmp16
+  %tmp18 = fadd float 2.000000e+00, %tmp17
+  %tmp19 = load float, float addrspace(3)* %tmp9
+  %tmp20 = load float, float addrspace(3)* %tmp10
+  %tmp21 = fmul float %tmp19, %tmp20
+  %tmp22 = fsub float %tmp18, %tmp21
+  %tmp23 = load float, float addrspace(3)* %tmp11
+  %tmp24 = load float, float addrspace(3)* %tmp12
+  %tmp25 = fmul float %tmp23, %tmp24
+  %tmp26 = fsub float %tmp22, %tmp25
+  %tmp27 = load float, float addrspace(3)* %tmp13
+  %tmp28 = load float, float addrspace(3)* %tmp14
+  %tmp29 = fmul float %tmp27, %tmp28
+  %tmp30 = fsub float %tmp26, %tmp29
+  store float %tmp30, float addrspace(1)* %tmp
+  ret void
+}
+
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 
