[llvm] f78687d - AMDGPU: Don't assert on misaligned DS read2/write2 offsets

Wed Aug 26 11:08:12 PDT 2020

Author: Matt Arsenault
Date: 2020-08-26T14:08:05-04:00
New Revision: f78687df9b790b4f4177a72cbd25b49d14c437b4

URL: https://github.com/llvm/llvm-project/commit/f78687df9b790b4f4177a72cbd25b49d14c437b4
DIFF: https://github.com/llvm/llvm-project/commit/f78687df9b790b4f4177a72cbd25b49d14c437b4.diff

LOG: AMDGPU: Don't assert on misaligned DS read2/write2 offsets

This would assert with unaligned DS access enabled. The offset may not
be aligned. Theoretically the pattern predicate should check the
memory alignment, although it is possible to have the memory be
aligned but not the immediate offset.

In this case I would expect it to use ds_{read|write}_b64 with
unaligned access, but I am not sure whether there is a reason it doesn't.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/test/CodeGen/AMDGPU/ds_write2.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2a5dac1f1e10..151b1bdd5538 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1304,9 +1304,9 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     unsigned OffsetValue0 = CAddr->getZExtValue() / Align;
     unsigned OffsetValue1 = OffsetValue0 + 1;
-    assert(Align * OffsetValue0 == CAddr->getZExtValue());
+    bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue();
 
-    if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) {
+    if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) {
       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
       MachineSDNode *MovZero =
           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 2454efaa5e35..47ae95eefea9 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
 
 ; FIXME: We don't get cases where the address was an SGPR because we
 ; get a copy to the address register for each one.
@@ -317,7 +318,9 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out)
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-NOT: ds_read2_b32
+; CI-COUNT-4: ds_read_u8
+; GFX9-ALIGNED-4: ds_read_u8
+; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -336,7 +339,9 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-NOT: ds_read2_b32
+; CI-COUNT-2: ds_read_u16
+; GFX9-ALIGNED-2: ds_read_u16
+; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -655,6 +660,22 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrsp
   ret <2 x float> %r1
 }
 
+@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
+
+; GCN-LABEL: {{^}}read2_v2i32_align1_odd_offset:
+; CI-COUNT-8: ds_read_u8
+
+; GFX9-ALIGNED-COUNT-8: ds_read_u8
+
+; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}}
+; GFX9-UNALIGNED: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_ADDR]] offset1:1{{$}}
+define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) {
+entry:
+  %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
+  store <2 x i32> %load, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 declare void @void_func_void() #3
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #1

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 6b0ce6391ca8..dce2884d77c3 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
@@ -523,6 +524,21 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
   ret void
 }
 
+@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
+
+; GCN-LABEL: {{^}}write2_v2i32_align1_odd_offset:
+; CI-COUNT-8: ds_write_b8
+
+; GFX9-ALIGNED-COUNT-8: ds_write_b8
+
+; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}}
+; GFX9-UNALIGNED: ds_write2_b32 [[BASE_ADDR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
+entry:
+  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.y() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1