[PATCH] D43053: AMDGPU: Fix incorrect reordering when inline asm defines LDS address
Matt Arsenault via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 7 16:55:31 PST 2018
arsenm created this revision.
arsenm added a reviewer: rampitec.
Herald added subscribers: t-tye, tpr, dstuttard, yaxunl, nhaehnle, wdng, kzhuravl.
Register defs that are not among the instruction's explicit defs (such as
the outputs of inline asm) also need to be checked.
https://reviews.llvm.org/D43053
Files:
lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
test/CodeGen/AMDGPU/merge-load-store.mir
Index: test/CodeGen/AMDGPU/merge-load-store.mir
===================================================================
--- test/CodeGen/AMDGPU/merge-load-store.mir
+++ test/CodeGen/AMDGPU/merge-load-store.mir
@@ -24,6 +24,41 @@
store i32 %4, i32 addrspace(3)* %ptr.0
ret void
}
+
+ @lds0 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+ @lds1 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+ @lds2 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+ @lds3 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+
+ define void @asm_defines_address() #0 {
+ bb:
+ %tmp1 = load i32, i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0), align 4
+ %0 = and i32 %tmp1, 255
+ %tmp3 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef), align 4
+ %tmp6 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef), align 4
+ %tmp7 = tail call i32 asm "v_or_b32 $0, 0, $1", "=v,v"(i32 %tmp6) #1
+ %tmp10 = lshr i32 %tmp7, 16
+ %tmp11 = and i32 %tmp10, 255
+ %tmp12 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp11
+ %tmp13 = load i32, i32 addrspace(3)* %tmp12, align 4
+ %tmp14 = xor i32 %tmp3, %tmp13
+ %tmp15 = lshr i32 %tmp14, 8
+ %tmp16 = and i32 %tmp15, 16711680
+ %tmp19 = lshr i32 %tmp16, 16
+ %tmp20 = and i32 %tmp19, 255
+ %tmp21 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp20
+ %tmp22 = load i32, i32 addrspace(3)* %tmp21, align 4
+ %tmp24 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef), align 4
+ %tmp25 = xor i32 %tmp22, %tmp24
+ %tmp26 = and i32 %tmp25, -16777216
+ %tmp28 = or i32 %0, %tmp26
+ store volatile i32 %tmp28, i32 addrspace(1)* undef
+ ret void
+ }
+
+ attributes #0 = { convergent nounwind }
+ attributes #1 = { convergent nounwind readnone }
+
...
---
name: mem_dependency
@@ -68,3 +103,29 @@
S_ENDPGM
...
+---
+# Make sure the asm def isn't moved after the point where it's used for
+# the address.
+# CHECK-LABEL: name: asm_defines_address
+# CHECK: DS_READ2ST64_B32
+# CHECK: DS_READ2ST64_B32
+# CHECK: INLINEASM
+# CHECK: DS_READ_B32
+# CHECK: DS_READ_B32
+name: asm_defines_address
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '' }
+body: |
+ bb.0:
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %2:vgpr_32 = DS_READ_B32 %1, 3072, 0, implicit $m0, implicit $exec :: (dereferenceable load 4 from `i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0)`, addrspace 3)
+ %3:vgpr_32 = DS_READ_B32 %1, 2048, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef)`, addrspace 3)
+ %4:vgpr_32 = DS_READ_B32 %1, 1024, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef)`, addrspace 3)
+ INLINEASM &"v_or_b32 $0, 0, $1", 32, 327690, def %0, 327689, %4
+ %5:vgpr_32 = DS_READ_B32 %0, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp12, addrspace 3)
+ %6:vgpr_32 = DS_READ_B32 %5, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp21, addrspace 3)
+ %7:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef)`, addrspace 3)
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7
+
+...
Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -174,9 +174,10 @@
}
static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
- // XXX: Should this be looking for implicit defs?
- for (const MachineOperand &Def : MI.defs())
- Defs.insert(Def.getReg());
+ for (const MachineOperand &Def : MI.operands()) {
+ if (Def.isReg() && Def.isDef())
+ Defs.insert(Def.getReg());
+ }
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D43053.133339.patch
Type: text/x-patch
Size: 4541 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180208/e1291bd0/attachment.bin>
More information about the llvm-commits
mailing list