[llvm] r262153 - AMDGPU: More bits of frame index are known to be zero
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 27 12:26:57 PST 2016
Author: arsenm
Date: Sat Feb 27 14:26:57 2016
New Revision: 262153
URL: http://llvm.org/viewvc/llvm-project?rev=262153&view=rev
Log:
AMDGPU: More bits of frame index are known to be zero
The maximum private allocation for the whole GPU is 4G,
so the maximum possible index for a single workitem is the
maximum size divided by the smallest granularity for a dispatch.
This increases the number of known zero high bits, which
enables more offset folding. The maximum private size per
workitem with this is 128M but may be smaller still.
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPU.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/private-element-size.ll
llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.td?rev=262153&r1=262152&r2=262153&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td Sat Feb 27 14:26:57 2016
@@ -198,14 +198,6 @@ def FeatureMaxPrivateElementSize4 : Feat
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
-
-def FeatureEnableHugeScratchBuffer : SubtargetFeature<
- "huge-scratch-buffer",
- "EnableHugeScratchBuffer",
- "true",
- "Enable scratch buffer sizes greater than 128 GB"
->;
-
def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
"EnableVGPRSpilling",
"true",
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp?rev=262153&r1=262152&r2=262153&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp Sat Feb 27 14:26:57 2016
@@ -84,7 +84,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const T
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false),
HasSMemRealTime(false), Has16BitInsts(false),
LDSBankCount(0),
- IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
+ IsaVersion(ISAVersion0_0_0),
EnableSIScheduler(false), FrameLowering(nullptr),
InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=262153&r1=262152&r2=262153&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Sat Feb 27 14:26:57 2016
@@ -93,7 +93,6 @@ private:
bool FeatureDisable;
int LDSBankCount;
unsigned IsaVersion;
- bool EnableHugeScratchBuffer;
bool EnableSIScheduler;
std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
@@ -293,10 +292,6 @@ public:
return false;
}
- bool enableHugeScratchBuffer() const {
- return EnableHugeScratchBuffer;
- }
-
bool enableSIScheduler() const {
return EnableSIScheduler;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=262153&r1=262152&r2=262153&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Sat Feb 27 14:26:57 2016
@@ -1178,25 +1178,35 @@ SDValue SITargetLowering::LowerFrameInde
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- // A FrameIndex node represents a 32-bit offset into scratch memory. If
- // the high bit of a frame index offset were to be set, this would mean
- // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
- // scratch buffer, with 64 being the number of threads per wave.
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If the
+ // high bit of a frame index offset were to be set, this would mean that it
+ // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
+ // buffer, with 64 being the number of threads per wave.
//
- // If we know the machine uses less than 128GB of scratch, then we can
- // amrk the high bit of the FrameIndex node as known zero,
- // which is important, because it means in most situations we can
- // prove that values derived from FrameIndex nodes are non-negative.
- // This enables us to take advantage of more addressing modes when
- // accessing scratch buffers, since for scratch reads/writes, the register
- // offset must always be positive.
+ // The maximum private allocation for the entire GPU is 4G, and we are
+ // concerned with the largest the index could ever be for an individual
+ // workitem. This will occur with the minimum dispatch size. If a program
+ // requires more, the dispatch size will be reduced.
+ //
+ // With this limit, we can mark the high bit of the FrameIndex node as known
+ // zero, which is important, because it means in most situations we can prove
+ // that values derived from FrameIndex nodes are non-negative. This enables us
+ // to take advantage of more addressing modes when accessing scratch buffers,
+ // since for scratch reads/writes, the register offset must always be
+ // positive.
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
- if (Subtarget->enableHugeScratchBuffer())
- return TFI;
+ uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+
+ // XXX - It is unclear if partial dispatch works. Assume it works at half wave
+ // granularity. It is probably a full wave.
+ uint64_t MinGranularity = 32;
+ unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
- DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
+ DAG.getValueType(ExtVT));
}
bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
Modified: llvm/trunk/test/CodeGen/AMDGPU/private-element-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/private-element-size.ll?rev=262153&r1=262152&r2=262153&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/private-element-size.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/private-element-size.ll Sat Feb 27 14:26:57 2016
@@ -33,9 +33,9 @@
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -99,10 +99,14 @@ entry:
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Modified: llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll?rev=262153&r1=262152&r2=262153&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll Sat Feb 27 14:26:57 2016
@@ -1,7 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
+; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; When a frame index offset is more than 12-bits, make sure we don't store
; it in mubuf's offset field.
@@ -102,8 +100,7 @@ entry:
}
; GCN-LABEL: @pos_vaddr_offse
-; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
-; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
entry:
%array = alloca [8192 x i32]
More information about the llvm-commits
mailing list