[PATCH] D43053: AMDGPU: Fix incorrect reordering when inline asm defines LDS address

Matt Arsenault via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 7 16:55:31 PST 2018


arsenm created this revision.
arsenm added a reviewer: rampitec.
Herald added subscribers: t-tye, tpr, dstuttard, yaxunl, nhaehnle, wdng, kzhuravl.

Register defs that are not among the instruction's explicit defs (for
example, the register defined by an INLINEASM instruction) also need
to be checked.


https://reviews.llvm.org/D43053

Files:
  lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
  test/CodeGen/AMDGPU/merge-load-store.mir


Index: test/CodeGen/AMDGPU/merge-load-store.mir
===================================================================
--- test/CodeGen/AMDGPU/merge-load-store.mir
+++ test/CodeGen/AMDGPU/merge-load-store.mir
@@ -24,6 +24,41 @@
     store i32 %4, i32 addrspace(3)* %ptr.0
     ret void
   }
+
+  @lds0 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+  @lds1 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+  @lds2 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+  @lds3 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+
+  define void @asm_defines_address() #0 {
+  bb:
+    %tmp1 = load i32, i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0), align 4
+    %0 = and i32 %tmp1, 255
+    %tmp3 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef), align 4
+    %tmp6 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef), align 4
+    %tmp7 = tail call i32 asm "v_or_b32 $0, 0, $1", "=v,v"(i32 %tmp6) #1
+    %tmp10 = lshr i32 %tmp7, 16
+    %tmp11 = and i32 %tmp10, 255
+    %tmp12 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp11
+    %tmp13 = load i32, i32 addrspace(3)* %tmp12, align 4
+    %tmp14 = xor i32 %tmp3, %tmp13
+    %tmp15 = lshr i32 %tmp14, 8
+    %tmp16 = and i32 %tmp15, 16711680
+    %tmp19 = lshr i32 %tmp16, 16
+    %tmp20 = and i32 %tmp19, 255
+    %tmp21 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp20
+    %tmp22 = load i32, i32 addrspace(3)* %tmp21, align 4
+    %tmp24 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef), align 4
+    %tmp25 = xor i32 %tmp22, %tmp24
+    %tmp26 = and i32 %tmp25, -16777216
+    %tmp28 = or i32 %0, %tmp26
+    store volatile i32 %tmp28, i32 addrspace(1)* undef
+    ret void
+  }
+
+  attributes #0 = { convergent nounwind }
+  attributes #1 = { convergent nounwind readnone }
+
 ...
 ---
 name:            mem_dependency
@@ -68,3 +103,29 @@
     S_ENDPGM
 
 ...
+---
+# Make sure the asm def isn't moved after the point where it's used for
+# the address.
+# CHECK-LABEL: name: asm_defines_address
+# CHECK: DS_READ2ST64_B32
+# CHECK: DS_READ2ST64_B32
+# CHECK: INLINEASM
+# CHECK: DS_READ_B32
+# CHECK: DS_READ_B32
+name:            asm_defines_address
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+body:             |
+  bb.0:
+    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %2:vgpr_32 = DS_READ_B32 %1, 3072, 0, implicit $m0, implicit $exec :: (dereferenceable load 4 from `i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0)`, addrspace 3)
+    %3:vgpr_32 = DS_READ_B32 %1, 2048, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef)`, addrspace 3)
+    %4:vgpr_32 = DS_READ_B32 %1, 1024, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef)`, addrspace 3)
+    INLINEASM &"v_or_b32 $0, 0, $1", 32, 327690, def %0, 327689, %4
+    %5:vgpr_32 = DS_READ_B32 %0, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp12, addrspace 3)
+    %6:vgpr_32 = DS_READ_B32 %5, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp21, addrspace 3)
+    %7:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef)`, addrspace 3)
+    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7
+
+...
Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -174,9 +174,10 @@
 }
 
 static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
-  // XXX: Should this be looking for implicit defs?
-  for (const MachineOperand &Def : MI.defs())
-    Defs.insert(Def.getReg());
+  for (const MachineOperand &Def : MI.operands()) {
+    if (Def.isReg() && Def.isDef())
+      Defs.insert(Def.getReg());
+  }
 }
 
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D43053.133339.patch
Type: text/x-patch
Size: 4541 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180208/e1291bd0/attachment.bin>


More information about the llvm-commits mailing list