[PATCH] D11587: AMDGPU: Assume SMRD access for constant address space

Fri Aug 7 06:31:51 PDT 2015

On Fri, Aug 07, 2015 at 12:32:21AM +0000, Matt Arsenault via llvm-commits wrote:
> arsenm updated this revision to Diff 31498.
> arsenm added a comment.
> 
> Fix tests since CI 32-bit immediates now work
> 

LGTM.
> 
> http://reviews.llvm.org/D11587
> 
> Files:
>   lib/Target/AMDGPU/SIISelLowering.cpp
>   lib/Target/AMDGPU/SIISelLowering.h
>   test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
>   test/CodeGen/AMDGPU/cgp-addressing-modes.ll
>   test/CodeGen/AMDGPU/salu-to-valu.ll
> 

> Index: test/CodeGen/AMDGPU/salu-to-valu.ll
> ===================================================================
> --- test/CodeGen/AMDGPU/salu-to-valu.ll
> +++ test/CodeGen/AMDGPU/salu-to-valu.ll
> @@ -75,10 +75,10 @@
>    ret void
>  }
>  
> -; Test moving ann SMRD with an immediate offset to the VALU
> +; Test moving an SMRD with an immediate offset to the VALU
>  
>  ; CHECK-LABEL: {{^}}smrd_valu2:
> -; CHECK: buffer_load_dword
> +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
>  define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
>  entry:
>    %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> @@ -89,6 +89,34 @@
>    ret void
>  }
>  
> +; CHECK-LABEL: {{^}}smrd_valu2_max_smrd_offset:
> +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
> +define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) {
> +entry:
> +  %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %1 = add i32 %0, 4
> +  %2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %0, i32 255
> +  %3 = load i32, i32 addrspace(2)* %2
> +  store i32 %3, i32 addrspace(1)* %out
> +  ret void
> +}
> +
> +; Offset is too big to fit in SMRD 8-bit offset, but small enough to
> +; fit in MUBUF offset.
> +; FIXME: We should be using the offset but we don't
> +
> +; CHECK-LABEL: {{^}}smrd_valu2_mubuf_offset:
> +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
> +define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) {
> +entry:
> +  %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %1 = add i32 %0, 4
> +  %2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %0, i32 256
> +  %3 = load i32, i32 addrspace(2)* %2
> +  store i32 %3, i32 addrspace(1)* %out
> +  ret void
> +}
> +
>  ; CHECK-LABEL: {{^}}s_load_imm_v8i32:
>  ; CHECK: buffer_load_dwordx4
>  ; CHECK: buffer_load_dwordx4
> Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll
> ===================================================================
> --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll
> +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll
> @@ -1,5 +1,7 @@
> +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
>  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
>  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
> +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
>  ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
>  ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
>  
> @@ -115,35 +117,6 @@
>    ret void
>  }
>  
> -; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
> -; OPT: getelementptr i32, i32 addrspace(4)* %in
> -; OPT: br i1
> -; OPT-NOT: ptrtoint
> -
> -; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
> -; GCN: flat_load_dword
> -; GCN: {{^}}BB4_2:
> -
> -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
> -entry:
> -  %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
> -  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
> -  %tmp0 = icmp eq i32 %cond, 0
> -  br i1 %tmp0, label %endif, label %if
> -
> -if:
> -  %tmp1 = load i32, i32 addrspace(4)* %in.gep
> -  br label %endif
> -
> -endif:
> -  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> -  store i32 %x, i32 addrspace(4)* %out.gep
> -  br label %done
> -
> -done:
> -  ret void
> -}
> -
>  ; OPT-LABEL: @test_sink_scratch_small_offset_i32(
>  ; OPT-NOT:  getelementptr [512 x i32]
>  ; OPT: br i1
> @@ -153,7 +126,7 @@
>  ; GCN: s_and_saveexec_b64
>  ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
>  ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
> -; GCN: {{^}}BB5_2:
> +; GCN: {{^}}BB4_2:
>  define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
>  entry:
>    %alloca = alloca [512 x i32], align 4
> @@ -189,7 +162,7 @@
>  ; GCN: s_and_saveexec_b64
>  ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
>  ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
> -; GCN: {{^}}BB6_2:
> +; GCN: {{^}}BB5_2:
>  define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
>  entry:
>    %alloca = alloca [512 x i32], align 4
> @@ -222,7 +195,7 @@
>  ; GCN: s_and_saveexec_b64
>  ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
>  ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
> -; GCN: {{^}}BB7_2:
> +; GCN: {{^}}BB6_2:
>  define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
>  entry:
>    %offset.ext = zext i32 %offset to i64
> @@ -246,3 +219,222 @@
>  
>  attributes #0 = { nounwind readnone }
>  attributes #1 = { nounwind }
> +
> +
> +
> +; OPT-LABEL: @test_sink_constant_small_offset_i32
> +; OPT-NOT:  getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(2)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
> +; OPT-NOT:  getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(2)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
> +; OPT-SI:  getelementptr i32, i32 addrspace(2)*
> +; OPT-CI-NOT:  getelementptr i32, i32 addrspace(2)*
> +; OPT-VI-NOT:  getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
> +
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(2)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
> +; OPT-SI: getelementptr i32, i32 addrspace(2)*
> +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 3{{$}}
> +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -4{{$}}
> +; GCN: s_add_u32
> +; GCN: s_addc_u32
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(2)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
> +; OPT: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
> +; GCN: s_and_saveexec_b64
> +; GCN: s_add_u32
> +; GCN: s_addc_u32
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(2)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +
> +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
> +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
> +
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(2)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
> +; OPT-SI: getelementptr i32, i32 addrspace(2)*
> +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT-VI: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +
> +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}
> +
> +; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
> +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(2)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> Index: test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
> @@ -0,0 +1,32 @@
> +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
> +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
> +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
> +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
> +
> +; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
> +; OPT: getelementptr i32, i32 addrspace(4)* %in
> +; OPT: br i1
> +; OPT-NOT: ptrtoint
> +
> +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
> +; GCN: flat_load_dword
> +; GCN: {{^}}BB0_2:
> +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(4)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(4)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> Index: lib/Target/AMDGPU/SIISelLowering.h
> ===================================================================
> --- lib/Target/AMDGPU/SIISelLowering.h
> +++ lib/Target/AMDGPU/SIISelLowering.h
> @@ -57,6 +57,7 @@
>    SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
>  
>    bool isLegalFlatAddressingMode(const AddrMode &AM) const;
> +  bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
>  public:
>    SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
>  
> Index: lib/Target/AMDGPU/SIISelLowering.cpp
> ===================================================================
> --- lib/Target/AMDGPU/SIISelLowering.cpp
> +++ lib/Target/AMDGPU/SIISelLowering.cpp
> @@ -261,15 +261,50 @@
>    return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
>  }
>  
> +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
> +  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
> +  // additionally can do r + r + i with addr64. 32-bit has more addressing
> +  // mode options. Depending on the resource constant, it can also do
> +  // (i64 r0) + (i32 r1) * (i14 i).
> +  //
> +  // Private arrays end up using a scratch buffer most of the time, so also
> +  // assume those use MUBUF instructions. Scratch loads / stores are currently
> +  // implemented as mubuf instructions with offen bit set, so slightly
> +  // different than the normal addr64.
> +  if (!isUInt<12>(AM.BaseOffs))
> +    return false;
> +
> +  // FIXME: Since we can split immediate into soffset and immediate offset,
> +  // would it make sense to allow any immediate?
> +
> +  switch (AM.Scale) {
> +  case 0: // r + i or just i, depending on HasBaseReg.
> +    return true;
> +  case 1:
> +    return true; // We have r + r or r + i.
> +  case 2:
> +    if (AM.HasBaseReg) {
> +      // Reject 2 * r + r.
> +      return false;
> +    }
> +
> +    // Allow 2 * r as r + r
> +    // Or  2 * r + i is allowed as r + r + i.
> +    return true;
> +  default: // Don't allow n * r
> +    return false;
> +  }
> +}
> +
>  bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
>                                               const AddrMode &AM, Type *Ty,
>                                               unsigned AS) const {
>    // No global is ever allowed as a base.
>    if (AM.BaseGV)
>      return false;
>  
>    switch (AS) {
> -  case AMDGPUAS::GLOBAL_ADDRESS:
> +  case AMDGPUAS::GLOBAL_ADDRESS: {
>      if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
>        // Assume the we will use FLAT for all global memory accesses
>        // on VI.
> @@ -282,51 +317,51 @@
>        // because it has never been validated.
>        return isLegalFlatAddressingMode(AM);
>      }
> -    // fall-through
> -  case AMDGPUAS::PRIVATE_ADDRESS:
> -  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
> -  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
> -    // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
> -    // additionally can do r + r + i with addr64. 32-bit has more addressing
> -    // mode options. Depending on the resource constant, it can also do
> -    // (i64 r0) + (i32 r1) * (i14 i).
> -    //
> -    // SMRD instructions have an 8-bit, dword offset.
> -    //
> -    // Assume nonunifom access, since the address space isn't enough to know
> -    // what instruction we will use, and since we don't know if this is a load
> -    // or store and scalar stores are only available on VI.
> -    //
> -    // We also know if we are doing an extload, we can't do a scalar load.
> -    //
> -    // Private arrays end up using a scratch buffer most of the time, so also
> -    // assume those use MUBUF instructions. Scratch loads / stores are currently
> -    // implemented as mubuf instructions with offen bit set, so slightly
> -    // different than the normal addr64.
> -    if (!isUInt<12>(AM.BaseOffs))
> -      return false;
>  
> -    // FIXME: Since we can split immediate into soffset and immediate offset,
> -    // would it make sense to allow any immediate?
> +    return isLegalMUBUFAddressingMode(AM);
> +  }
> +  case AMDGPUAS::CONSTANT_ADDRESS: {
> +    // If the offset isn't a multiple of 4, it probably isn't going to be
> +    // correctly aligned.
> +    if (AM.BaseOffs % 4 != 0)
> +      return isLegalMUBUFAddressingMode(AM);
> +
> +    // There are no SMRD extloads, so if we have to do a small type access we
> +    // will use a MUBUF load.
> +    // FIXME?: We also need to do this if unaligned, but we don't know the
> +    // alignment here.
> +    if (DL.getTypeStoreSize(Ty) < 4)
> +      return isLegalMUBUFAddressingMode(AM);
> +
> +    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
> +      // SMRD instructions have an 8-bit, dword offset on SI.
> +      if (!isUInt<8>(AM.BaseOffs / 4))
> +        return false;
> +    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
> +      // On CI, this can also be a 32-bit literal constant offset. If it fits
> +      // in 8-bits, it can use a smaller encoding.
> +      if (!isUInt<32>(AM.BaseOffs / 4))
> +        return false;
> +    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
> +      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
> +      if (!isUInt<20>(AM.BaseOffs))
> +        return false;
> +    } else
> +      llvm_unreachable("unhandled generation");
>  
> -    switch (AM.Scale) {
> -    case 0: // r + i or just i, depending on HasBaseReg.
> +    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
>        return true;
> -    case 1:
> -      return true; // We have r + r or r + i.
> -    case 2:
> -      if (AM.HasBaseReg) {
> -        // Reject 2 * r + r.
> -        return false;
> -      }
>  
> -      // Allow 2 * r as r + r
> -      // Or  2 * r + i is allowed as r + r + i.
> +    if (AM.Scale == 1 && AM.HasBaseReg)
>        return true;
> -    default: // Don't allow n * r
> -      return false;
> -    }
> +
> +    return false;
>    }
> +
> +  case AMDGPUAS::PRIVATE_ADDRESS:
> +  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
> +    return isLegalMUBUFAddressingMode(AM);
> +
>    case AMDGPUAS::LOCAL_ADDRESS:
>    case AMDGPUAS::REGION_ADDRESS: {
>      // Basic, single offset DS instructions allow a 16-bit unsigned immediate

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits