[llvm] 124acb7 - [AMDGPU] Fix negative offset values interpretation in getMemOperandsWithOffset for DS

Wed Apr 26 06:10:31 PDT 2023

Author: Janek van Oirschot
Date: 2023-04-26T14:10:25+01:00
New Revision: 124acb7ca31aa358cc5f7b4a5d09edbd1c0cac84

URL: https://github.com/llvm/llvm-project/commit/124acb7ca31aa358cc5f7b4a5d09edbd1c0cac84
DIFF: https://github.com/llvm/llvm-project/commit/124acb7ca31aa358cc5f7b4a5d09edbd1c0cac84.diff

LOG: [AMDGPU] Fix negative offset values interpretation in getMemOperandsWithOffset for DS

The offset values may result in an erroneous scheduling of a load before write for a memory location if the offset values are represented as negative values in MIR, despite actually being unsigned values. This representation in MIR happens as SelectionDAG::getConstant could go through APInt to represent the encoding which assumes the MSB of the encoding as a sign-bit, regardless of whether it is supposed to be a signed value. The 8-bit negative (interpreted) value gets cast to an unsigned 32 bit value in getMemOperandsWithOffset used for comparisons in areMemAccessesTriviallyDisjoint eventually leading to an erroneous schedule in the machine scheduler.

Reviewed By: arsenm, foad

Differential Revision: https://reviews.llvm.org/D149080

Added: 
    llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 25aecfddb7cb..055441d2488b 100644

--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -329,8 +329,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
       const MachineOperand *Offset1Op =
           getNamedOperand(LdSt, AMDGPU::OpName::offset1);
 
-      unsigned Offset0 = Offset0Op->getImm();
-      unsigned Offset1 = Offset1Op->getImm();
+      unsigned Offset0 = Offset0Op->getImm() & 0xff;
+      unsigned Offset1 = Offset1Op->getImm() & 0xff;
       if (Offset0 + 1 != Offset1)
         return false;
 

diff  --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index e0ccbcda7fa0..4d73e438f617 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,10 +1,25 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
+%struct.lds = type { [64 x ptr], [16 x i8] }
+ at stored_lds_struct = addrspace(3) global %struct.lds addrspace(3) *undef, align 16
 @stored_lds_ptr = addrspace(3) global ptr addrspace(3) undef, align 4
 @stored_constant_ptr = addrspace(3) global ptr addrspace(4) undef, align 8
 @stored_global_ptr = addrspace(3) global ptr addrspace(1) undef, align 8
 
+; GCN-LABEL: {{^}}no_reorder_flat_load_local_store_local_load:
+; GCN: flat_load_dwordx4
+; GCN: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:512
+; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:129 offset1:130
+define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrspace(3) %out, ptr %fptr) #0 {
+  %ptr1 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1
+  %ptr2 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1, i32 4
+  call void @llvm.memcpy.p3.p0(ptr addrspace(3) align 16 %ptr1, ptr align 8 %fptr, i64 16, i1 false)
+  %vector_load = load <2 x i32>, ptr addrspace(3) %ptr2, align 4
+  store <2 x i32> %vector_load, ptr addrspace(3) %out, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load:
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
@@ -312,6 +327,7 @@ define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(ptr addrspace
   ret void
 }
 
+declare void @llvm.memcpy.p3.p0(ptr addrspace(3), ptr, i64, i1)
 declare void @llvm.amdgcn.s.barrier() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #3

diff  --git a/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir b/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir
new file mode 100644
index 000000000000..80337069a462
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir
@@ -0,0 +1,37 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched --debug-only=machine-scheduler -run-pass=machine-scheduler -o /dev/null %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Make sure handling of unsigned immediate values interpreted as negative values
+# still works for SIInstrInfo::areMemAccessesTriviallyDisjoint.
+
+# LABEL: {{^}}no_reorder_flat_load_local_store_local_load:
+# CHECK:      SU(5):   %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK:      SU(6):   DS_WRITE_B128_gfx9 %5:vgpr_32, %4:vreg_128, 512, 0, implicit $exec
+# CHECK:      SU(7):   %6:vreg_64 = DS_READ2_B32_gfx9 %5:vgpr_32, -127, -126, 0, implicit $exec
+# CHECK:      Predecessors:
+# CHECK-NEXT: SU(6): Ord
+# CHECK-NEXT: SU(5): Data
+---
+name:            no_reorder_flat_load_local_store_local_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %4:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 36, 0
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 44, 0
+    %7:vreg_64 = COPY %5
+    %6:vreg_128 = FLAT_LOAD_DWORDX4 killed %7, 0, 0, implicit $exec, implicit $flat_scr
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    DS_WRITE_B128_gfx9 %8, killed %6, 512, 0, implicit $exec
+    %9:vreg_64 = DS_READ2_B32_gfx9 %8, -127, -126, 0, implicit $exec
+    %10:sreg_32 = COPY %9.sub1
+    %11:sreg_32 = COPY %9.sub0
+    %12:vgpr_32 = COPY %4
+    %13:vgpr_32 = COPY %11
+    %14:vgpr_32 = COPY %10
+    DS_WRITE2_B32_gfx9 killed %12, killed %13, killed %14, 0, 1, 0, implicit $exec
+    S_ENDPGM 0
+
+...