[PATCH] D11587: AMDGPU: Assume SMRD access for constant address space
Tom Stellard via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 7 06:31:51 PDT 2015
On Fri, Aug 07, 2015 at 12:32:21AM +0000, Matt Arsenault via llvm-commits wrote:
> arsenm updated this revision to Diff 31498.
> arsenm added a comment.
>
> Fix tests since CI 32-bit immediates now work
>
LGTM.
>
> http://reviews.llvm.org/D11587
>
> Files:
> lib/Target/AMDGPU/SIISelLowering.cpp
> lib/Target/AMDGPU/SIISelLowering.h
> test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
> test/CodeGen/AMDGPU/cgp-addressing-modes.ll
> test/CodeGen/AMDGPU/salu-to-valu.ll
>
> Index: test/CodeGen/AMDGPU/salu-to-valu.ll
> ===================================================================
> --- test/CodeGen/AMDGPU/salu-to-valu.ll
> +++ test/CodeGen/AMDGPU/salu-to-valu.ll
> @@ -75,10 +75,10 @@
> ret void
> }
>
> -; Test moving ann SMRD with an immediate offset to the VALU
> +; Test moving an SMRD with an immediate offset to the VALU
>
> ; CHECK-LABEL: {{^}}smrd_valu2:
> -; CHECK: buffer_load_dword
> +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
> define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
> entry:
> %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> @@ -89,6 +89,34 @@
> ret void
> }
>
> +; CHECK-LABEL: {{^}}smrd_valu2_max_smrd_offset:
> +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
> +define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) {
> +entry:
> + %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %1 = add i32 %0, 4
> + %2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %0, i32 255
> + %3 = load i32, i32 addrspace(2)* %2
> + store i32 %3, i32 addrspace(1)* %out
> + ret void
> +}
> +
> +; Offset is too big to fit in SMRD 8-bit offset, but small enough to
> +; fit in MUBUF offset.
> +; FIXME: We should be using the offset but we don't
> +
> +; CHECK-LABEL: {{^}}smrd_valu2_mubuf_offset:
> +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
> +define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) {
> +entry:
> + %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %1 = add i32 %0, 4
> + %2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %0, i32 256
> + %3 = load i32, i32 addrspace(2)* %2
> + store i32 %3, i32 addrspace(1)* %out
> + ret void
> +}
> +
> ; CHECK-LABEL: {{^}}s_load_imm_v8i32:
> ; CHECK: buffer_load_dwordx4
> ; CHECK: buffer_load_dwordx4
> Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll
> ===================================================================
> --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll
> +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll
> @@ -1,5 +1,7 @@
> +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
> ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
> ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
> +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
> ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
> ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
>
> @@ -115,35 +117,6 @@
> ret void
> }
>
> -; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
> -; OPT: getelementptr i32, i32 addrspace(4)* %in
> -; OPT: br i1
> -; OPT-NOT: ptrtoint
> -
> -; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
> -; GCN: flat_load_dword
> -; GCN: {{^}}BB4_2:
> -
> -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
> -entry:
> - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
> - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
> - %tmp0 = icmp eq i32 %cond, 0
> - br i1 %tmp0, label %endif, label %if
> -
> -if:
> - %tmp1 = load i32, i32 addrspace(4)* %in.gep
> - br label %endif
> -
> -endif:
> - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> - store i32 %x, i32 addrspace(4)* %out.gep
> - br label %done
> -
> -done:
> - ret void
> -}
> -
> ; OPT-LABEL: @test_sink_scratch_small_offset_i32(
> ; OPT-NOT: getelementptr [512 x i32]
> ; OPT: br i1
> @@ -153,7 +126,7 @@
> ; GCN: s_and_saveexec_b64
> ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
> ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
> -; GCN: {{^}}BB5_2:
> +; GCN: {{^}}BB4_2:
> define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
> entry:
> %alloca = alloca [512 x i32], align 4
> @@ -189,7 +162,7 @@
> ; GCN: s_and_saveexec_b64
> ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
> ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
> -; GCN: {{^}}BB6_2:
> +; GCN: {{^}}BB5_2:
> define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
> entry:
> %alloca = alloca [512 x i32], align 4
> @@ -222,7 +195,7 @@
> ; GCN: s_and_saveexec_b64
> ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
> ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
> -; GCN: {{^}}BB7_2:
> +; GCN: {{^}}BB6_2:
> define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
> entry:
> %offset.ext = zext i32 %offset to i64
> @@ -246,3 +219,222 @@
>
> attributes #0 = { nounwind readnone }
> attributes #1 = { nounwind }
> +
> +
> +
> +; OPT-LABEL: @test_sink_constant_small_offset_i32
> +; OPT-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(2)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(1)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
> +; OPT-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(2)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(1)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
> +; OPT-SI: getelementptr i32, i32 addrspace(2)*
> +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
> +
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(2)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(1)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
> +; OPT-SI: getelementptr i32, i32 addrspace(2)*
> +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 3{{$}}
> +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -4{{$}}
> +; GCN: s_add_u32
> +; GCN: s_addc_u32
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(2)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(1)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
> +; OPT: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
> +; GCN: s_and_saveexec_b64
> +; GCN: s_add_u32
> +; GCN: s_addc_u32
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(2)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(1)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +
> +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
> +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
> +
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(2)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(1)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> +
> +; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
> +; OPT-SI: getelementptr i32, i32 addrspace(2)*
> +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
> +; OPT-VI: getelementptr i32, i32 addrspace(2)*
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
> +; GCN: s_and_saveexec_b64
> +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
> +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +
> +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}
> +
> +; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
> +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
> +
> +; GCN: s_or_b64 exec, exec
> +define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(2)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(1)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> Index: test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
> @@ -0,0 +1,32 @@
> +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
> +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
> +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
> +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
> +
> +; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
> +; OPT: getelementptr i32, i32 addrspace(4)* %in
> +; OPT: br i1
> +; OPT-NOT: ptrtoint
> +
> +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
> +; GCN: flat_load_dword
> +; GCN: {{^}}BB0_2:
> +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
> +entry:
> + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
> + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
> + %tmp0 = icmp eq i32 %cond, 0
> + br i1 %tmp0, label %endif, label %if
> +
> +if:
> + %tmp1 = load i32, i32 addrspace(4)* %in.gep
> + br label %endif
> +
> +endif:
> + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> + store i32 %x, i32 addrspace(4)* %out.gep
> + br label %done
> +
> +done:
> + ret void
> +}
> Index: lib/Target/AMDGPU/SIISelLowering.h
> ===================================================================
> --- lib/Target/AMDGPU/SIISelLowering.h
> +++ lib/Target/AMDGPU/SIISelLowering.h
> @@ -57,6 +57,7 @@
> SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
>
> bool isLegalFlatAddressingMode(const AddrMode &AM) const;
> + bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
> public:
> SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
>
> Index: lib/Target/AMDGPU/SIISelLowering.cpp
> ===================================================================
> --- lib/Target/AMDGPU/SIISelLowering.cpp
> +++ lib/Target/AMDGPU/SIISelLowering.cpp
> @@ -261,15 +261,50 @@
> return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
> }
>
> +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
> + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
> + // additionally can do r + r + i with addr64. 32-bit has more addressing
> + // mode options. Depending on the resource constant, it can also do
> + // (i64 r0) + (i32 r1) * (i14 i).
> + //
> + // Private arrays end up using a scratch buffer most of the time, so also
> + // assume those use MUBUF instructions. Scratch loads / stores are currently
> + // implemented as mubuf instructions with offen bit set, so slightly
> + // different than the normal addr64.
> + if (!isUInt<12>(AM.BaseOffs))
> + return false;
> +
> + // FIXME: Since we can split immediate into soffset and immediate offset,
> + // would it make sense to allow any immediate?
> +
> + switch (AM.Scale) {
> + case 0: // r + i or just i, depending on HasBaseReg.
> + return true;
> + case 1:
> + return true; // We have r + r or r + i.
> + case 2:
> + if (AM.HasBaseReg) {
> + // Reject 2 * r + r.
> + return false;
> + }
> +
> + // Allow 2 * r as r + r
> + // Or 2 * r + i is allowed as r + r + i.
> + return true;
> + default: // Don't allow n * r
> + return false;
> + }
> +}
> +
> bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
> const AddrMode &AM, Type *Ty,
> unsigned AS) const {
> // No global is ever allowed as a base.
> if (AM.BaseGV)
> return false;
>
> switch (AS) {
> - case AMDGPUAS::GLOBAL_ADDRESS:
> + case AMDGPUAS::GLOBAL_ADDRESS: {
> if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
> // Assume that we will use FLAT for all global memory accesses
> // on VI.
> @@ -282,51 +317,51 @@
> // because it has never been validated.
> return isLegalFlatAddressingMode(AM);
> }
> - // fall-through
> - case AMDGPUAS::PRIVATE_ADDRESS:
> - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
> - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
> - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
> - // additionally can do r + r + i with addr64. 32-bit has more addressing
> - // mode options. Depending on the resource constant, it can also do
> - // (i64 r0) + (i32 r1) * (i14 i).
> - //
> - // SMRD instructions have an 8-bit, dword offset.
> - //
> - // Assume nonuniform access, since the address space isn't enough to know
> - // what instruction we will use, and since we don't know if this is a load
> - // or store and scalar stores are only available on VI.
> - //
> - // We also know if we are doing an extload, we can't do a scalar load.
> - //
> - // Private arrays end up using a scratch buffer most of the time, so also
> - // assume those use MUBUF instructions. Scratch loads / stores are currently
> - // implemented as mubuf instructions with offen bit set, so slightly
> - // different than the normal addr64.
> - if (!isUInt<12>(AM.BaseOffs))
> - return false;
>
> - // FIXME: Since we can split immediate into soffset and immediate offset,
> - // would it make sense to allow any immediate?
> + return isLegalMUBUFAddressingMode(AM);
> + }
> + case AMDGPUAS::CONSTANT_ADDRESS: {
> + // If the offset isn't a multiple of 4, it probably isn't going to be
> + // correctly aligned.
> + if (AM.BaseOffs % 4 != 0)
> + return isLegalMUBUFAddressingMode(AM);
> +
> + // There are no SMRD extloads, so if we have to do a small type access we
> + // will use a MUBUF load.
> + // FIXME?: We also need to do this if unaligned, but we don't know the
> + // alignment here.
> + if (DL.getTypeStoreSize(Ty) < 4)
> + return isLegalMUBUFAddressingMode(AM);
> +
> + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
> + // SMRD instructions have an 8-bit, dword offset on SI.
> + if (!isUInt<8>(AM.BaseOffs / 4))
> + return false;
> + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
> + // On CI, this can also be a 32-bit literal constant offset. If it fits
> + // in 8-bits, it can use a smaller encoding.
> + if (!isUInt<32>(AM.BaseOffs / 4))
> + return false;
> + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
> + // On VI, these use the SMEM format and the offset is 20-bit in bytes.
> + if (!isUInt<20>(AM.BaseOffs))
> + return false;
> + } else
> + llvm_unreachable("unhandled generation");
>
> - switch (AM.Scale) {
> - case 0: // r + i or just i, depending on HasBaseReg.
> + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
> return true;
> - case 1:
> - return true; // We have r + r or r + i.
> - case 2:
> - if (AM.HasBaseReg) {
> - // Reject 2 * r + r.
> - return false;
> - }
>
> - // Allow 2 * r as r + r
> - // Or 2 * r + i is allowed as r + r + i.
> + if (AM.Scale == 1 && AM.HasBaseReg)
> return true;
> - default: // Don't allow n * r
> - return false;
> - }
> +
> + return false;
> }
> +
> + case AMDGPUAS::PRIVATE_ADDRESS:
> + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
> + return isLegalMUBUFAddressingMode(AM);
> +
> case AMDGPUAS::LOCAL_ADDRESS:
> case AMDGPUAS::REGION_ADDRESS: {
> // Basic, single offset DS instructions allow a 16-bit unsigned immediate
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
More information about the llvm-commits mailing list