[llvm] r309471 - AMDGPU: Teach isLegalAddressingMode about global_* instructions

Fri Jul 28 18:12:32 PDT 2017

Author: arsenm
Date: Fri Jul 28 18:12:31 2017
New Revision: 309471

URL: http://llvm.org/viewvc/llvm-project?rev=309471&view=rev
Log:
AMDGPU: Teach isLegalAddressingMode about global_* instructions

Also refine the flat check to respect flat-for-global feature,
and constant fallback should check global handling, not
specifically MUBUF.

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
    llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=309471&r1=309470&r2=309471&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Jul 28 18:12:31 2017
@@ -586,6 +586,26 @@ bool SITargetLowering::isLegalFlatAddres
   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
 }
 
+bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
+  if (Subtarget->hasFlatGlobalInsts())
+    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
+
+  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
+      // Assume the we will use FLAT for all global memory accesses
+      // on VI.
+      // FIXME: This assumption is currently wrong.  On VI we still use
+      // MUBUF instructions for the r + i addressing mode.  As currently
+      // implemented, the MUBUF instructions only work on buffer < 4GB.
+      // It may be possible to support > 4GB buffers with MUBUF instructions,
+      // by setting the stride value in the resource descriptor which would
+      // increase the size limit to (stride * 4GB).  However, this is risky,
+      // because it has never been validated.
+    return isLegalFlatAddressingMode(AM);
+  }
+
+  return isLegalMUBUFAddressingMode(AM);
+}
+
 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
   // additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -628,22 +648,10 @@ bool SITargetLowering::isLegalAddressing
   if (AM.BaseGV)
     return false;
 
-  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
-    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
-      // Assume the we will use FLAT for all global memory accesses
-      // on VI.
-      // FIXME: This assumption is currently wrong.  On VI we still use
-      // MUBUF instructions for the r + i addressing mode.  As currently
-      // implemented, the MUBUF instructions only work on buffer < 4GB.
-      // It may be possible to support > 4GB buffers with MUBUF instructions,
-      // by setting the stride value in the resource descriptor which would
-      // increase the size limit to (stride * 4GB).  However, this is risky,
-      // because it has never been validated.
-      return isLegalFlatAddressingMode(AM);
-    }
+  if (AS == AMDGPUASI.GLOBAL_ADDRESS)
+    return isLegalGlobalAddressingMode(AM);
 
-    return isLegalMUBUFAddressingMode(AM);
-  } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -655,7 +663,7 @@ bool SITargetLowering::isLegalAddressing
     // FIXME?: We also need to do this if unaligned, but we don't know the
     // alignment here.
     if (DL.getTypeStoreSize(Ty) < 4)
-      return isLegalMUBUFAddressingMode(AM);
+      return isLegalGlobalAddressingMode(AM);
 
     if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
       // SMRD instructions have an 8-bit, dword offset on SI.

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=309471&r1=309470&r2=309471&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Fri Jul 28 18:12:31 2017
@@ -118,6 +118,7 @@ class SITargetLowering final : public AM
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
   unsigned isCFIntrinsic(const SDNode *Intr) const;

Modified: llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll?rev=309471&r1=309470&r2=309471&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll Fri Jul 28 18:12:31 2017
@@ -1,9 +1,11 @@
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
@@ -14,7 +16,6 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; OPT-CI: getelementptr i8,
 
 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
-; GCN: {{^}}BB0_2:
 define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@@ -42,7 +43,8 @@ done:
 
 ; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
 ; GCN: {{^}}BB1_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -69,7 +71,8 @@ done:
 
 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
+; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
 ; GCN: {{^}}BB2_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -96,7 +99,8 @@ done:
 
 ; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
 ; GCN: {{^}}BB3_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -670,6 +674,67 @@ endif:
   br label %done
 
 done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
+; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+; OPT-SICIV: br
+; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
+
+; OPT-GFX9: br
+; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
+
+; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
+; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
+define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
+  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i8, i8 addrspace(1)* %in.gep
+  %tmp2 = sext i8 %tmp1 to i32
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
+; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
+; OPT: br
+; OPT: load i8, i8 addrspace(1)* %in.gep
+
+; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
+define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
+  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i8, i8 addrspace(1)* %in.gep
+  %tmp2 = sext i8 %tmp1 to i32
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
   ret void
 }