[llvm] r309471 - AMDGPU: Teach isLegalAddressingMode about global_* instructions
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 28 18:12:32 PDT 2017
Author: arsenm
Date: Fri Jul 28 18:12:31 2017
New Revision: 309471
URL: http://llvm.org/viewvc/llvm-project?rev=309471&view=rev
Log:
AMDGPU: Teach isLegalAddressingMode about global_* instructions
Also refine the flat check to respect flat-for-global feature,
and constant fallback should check global handling, not
specifically MUBUF.
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=309471&r1=309470&r2=309471&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Jul 28 18:12:31 2017
@@ -586,6 +586,26 @@ bool SITargetLowering::isLegalFlatAddres
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
+bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
+ if (Subtarget->hasFlatGlobalInsts())
+ return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
+
+ if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
+ // Assume the we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
+
+ return isLegalMUBUFAddressingMode(AM);
+}
+
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -628,22 +648,10 @@ bool SITargetLowering::isLegalAddressing
if (AM.BaseGV)
return false;
- if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- // Assume the we will use FLAT for all global memory accesses
- // on VI.
- // FIXME: This assumption is currently wrong. On VI we still use
- // MUBUF instructions for the r + i addressing mode. As currently
- // implemented, the MUBUF instructions only work on buffer < 4GB.
- // It may be possible to support > 4GB buffers with MUBUF instructions,
- // by setting the stride value in the resource descriptor which would
- // increase the size limit to (stride * 4GB). However, this is risky,
- // because it has never been validated.
- return isLegalFlatAddressingMode(AM);
- }
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS)
+ return isLegalGlobalAddressingMode(AM);
- return isLegalMUBUFAddressingMode(AM);
- } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -655,7 +663,7 @@ bool SITargetLowering::isLegalAddressing
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
if (DL.getTypeStoreSize(Ty) < 4)
- return isLegalMUBUFAddressingMode(AM);
+ return isLegalGlobalAddressingMode(AM);
if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=309471&r1=309470&r2=309471&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Fri Jul 28 18:12:31 2017
@@ -118,6 +118,7 @@ class SITargetLowering final : public AM
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;
Modified: llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll?rev=309471&r1=309470&r2=309471&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll Fri Jul 28 18:12:31 2017
@@ -1,9 +1,11 @@
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
@@ -14,7 +16,6 @@ target datalayout = "e-p:32:32-p1:64:64-
; OPT-CI: getelementptr i8,
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
-; GCN: {{^}}BB0_2:
define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@@ -42,7 +43,8 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^}}BB1_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -69,7 +71,8 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
+; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
; GCN: {{^}}BB2_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -96,7 +99,8 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^}}BB3_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -670,6 +674,67 @@ endif:
br label %done
done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
+; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+; OPT-SICIV: br
+; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
+
+; OPT-GFX9: br
+; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
+
+; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
+; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
+define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
+ %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(1)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
+; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
+; OPT: br
+; OPT: load i8, i8 addrspace(1)* %in.gep
+
+; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
+define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
+ %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(1)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep
+ br label %done
+
+done:
ret void
}
More information about the llvm-commits
mailing list