[llvm] r361541 - AMDGPU: Correct maximum possible private allocation size
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu May 23 12:38:15 PDT 2019
Author: arsenm
Date: Thu May 23 12:38:14 2019
New Revision: 361541
URL: http://llvm.org/viewvc/llvm-project?rev=361541&view=rev
Log:
AMDGPU: Correct maximum possible private allocation size
We were assuming a much larger per-wave visible stack allocation
than is actually possible:
https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/faa3ae51388517353afcdaf9c16621f879ef0a59/src/core/runtime/amd_gpu_agent.cpp#L70
Based on this, we can assume the high 15 bits of a frame index or sret
pointer are 0. The frame index value is the per-lane offset, so the
maximum frame index value is MAX_WAVE_SCRATCH / wavesize.
Remove the corresponding subtarget feature and option that made
this configurable.
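For reference, here is a minimal standalone sketch of the arithmetic
behind the new getKnownHighZeroBitsForFrameIndex() helper, assuming a
wave64 target. The constant mirrors the MaxWaveScratchSize definition
added below; clz32 stands in for llvm::countLeadingZeros, and the rest
is illustrative rather than in-tree code:

#include <cstdio>

// Mirrors the new GCNSubtarget constant: COMPUTE_TMPRING_SIZE.WAVESIZE
// is a 13-bit field counted in units of 256 dwords (256 * 4 bytes).
constexpr unsigned MaxWaveScratchSize = (256 * 4) * ((1u << 13) - 1);

// Portable stand-in for llvm::countLeadingZeros on a nonzero value.
static unsigned clz32(unsigned X) {
  unsigned N = 0;
  for (unsigned Bit = 1u << 31; Bit && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

int main() {
  const unsigned WavefrontSizeLog2 = 6; // wave64, assumed for this sketch.
  // The frame index is a per-lane byte offset, so its maximum value is
  // MaxWaveScratchSize / wavesize; dividing by the wavesize contributes
  // WavefrontSizeLog2 additional known-zero high bits.
  unsigned KnownZero = clz32(MaxWaveScratchSize) + WavefrontSizeLog2;
  printf("MaxWaveScratchSize   = %u bytes\n", MaxWaveScratchSize); // 8387584
  printf("known-zero high bits = %u\n", KnownZero);                // 15
  printf("frame index fits in  %u bits\n", 32 - KnownZero);        // 17
}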
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPU.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll
llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
llvm/trunk/test/CodeGen/AMDGPU/huge-private-buffer.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.td?rev=361541&r1=361540&r2=361541&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td Thu May 23 12:38:14 2019
@@ -458,13 +458,6 @@ def FeatureMaxPrivateElementSize4 : Feat
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
-def FeatureEnableHugePrivateBuffer : SubtargetFeature<
- "huge-private-buffer",
- "EnableHugePrivateBuffer",
- "true",
- "Enable private/scratch buffer sizes greater than 128 GB"
->;
-
def FeatureDumpCode : SubtargetFeature <"DumpCode",
"DumpCode",
"true",
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp?rev=361541&r1=361540&r2=361541&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp Thu May 23 12:38:14 2019
@@ -190,7 +190,6 @@ GCNSubtarget::GCNSubtarget(const Triple
EnableCuMode(false),
TrapHandler(false),
- EnableHugePrivateBuffer(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=361541&r1=361540&r2=361541&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Thu May 23 12:38:14 2019
@@ -299,7 +299,6 @@ protected:
bool TrapHandler;
// Used as options.
- bool EnableHugePrivateBuffer;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
@@ -377,6 +376,9 @@ private:
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE, a 13-bit field in units of 256 dwords.
+ static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
+
public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM);
@@ -436,6 +438,11 @@ public:
return Log2_32(WavefrontSize);
}
+ /// Return the number of high bits known to be zero for a frame index.
+ unsigned getKnownHighZeroBitsForFrameIndex() const {
+ return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+ }
+
int getLDSBankCount() const {
return LDSBankCount;
}
@@ -526,10 +533,6 @@ public:
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
- bool enableHugePrivateBuffer() const {
- return EnableHugePrivateBuffer;
- }
-
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=361541&r1=361540&r2=361541&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Thu May 23 12:38:14 2019
@@ -93,12 +93,6 @@ static cl::opt<bool> EnableVGPRIndexMode
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
-static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
- "amdgpu-frame-index-zero-bits",
- cl::desc("High bits of frame index assumed to be zero"),
- cl::init(5),
- cl::ReallyHidden);
-
static cl::opt<bool> DisableLoopAlignment(
"amdgpu-disable-loop-alignment",
cl::desc("Do not align and prefetch loops"),
@@ -2059,13 +2053,14 @@ SDValue SITargetLowering::LowerFormalArg
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+ if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
// FIXME: This helps when the return is a real sret. If it is an
// automatically inserted sret (i.e. CanLowerReturn returns false), an
// extra copy is inserted in SelectionDAGBuilder which obscures this.
- unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+ unsigned NumBits
+ = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
}
@@ -9970,14 +9965,10 @@ void SITargetLowering::computeKnownBitsF
TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
DAG, Depth);
- if (getSubtarget()->enableHugePrivateBuffer())
- return;
-
- // Technically it may be possible to have a dispatch with a single workitem
- // that uses the full private memory size, but that's not really useful. We
- // can't use vaddr in MUBUF instructions if we don't know the address
+ // Set the high bits to zero based on the maximum allowed scratch size per
+ // wave. We can't use vaddr in MUBUF instructions if we don't know the address
// calculation won't overflow, so assume the sign bit is never set.
- Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+ Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
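One payoff shows up in the frame-index-elimination.ll updates below:
with 15 high bits of a frame index known zero, the existing AMDGPU
combine that narrows a 32-bit multiply to v_mul_u32_u24 now fires,
replacing v_mul_lo_u32. A hedged sketch of the legality reasoning only
(the fitsInU24 helper is illustrative, not the in-tree code):

#include <cassert>

// v_mul_u32_u24 computes the 32-bit product of the low 24 bits of each
// operand, so it matches a full 32-bit multiply whenever both operands
// are known to fit in 24 bits, i.e. have at least 32 - 24 = 8
// known-zero high bits.
static bool fitsInU24(unsigned KnownZeroHighBits) {
  return KnownZeroHighBits >= 8;
}

int main() {
  // A frame index now has 15 known-zero high bits; the constant 9 has 28.
  assert(fitsInU24(15) && fitsInU24(28)); // -> v_mul_u32_u24 is legal here
}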
Modified: llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll?rev=361541&r1=361540&r2=361541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll Thu May 23 12:38:14 2019
@@ -60,7 +60,7 @@ define void @func_add_constant_to_fi_i32
; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
-; GCN-NEXT: v_mul_lo_u32 v0, v0, 9
+; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_other_fi_user_i32() #0 {
@@ -172,7 +172,7 @@ ret:
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], s6, [[SCALED]]
-; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9
+; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
; GCN: ds_write_b32 v0, [[VZ]]
define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
%alloca0 = alloca [128 x i32], align 4, addrspace(5)
@@ -196,7 +196,7 @@ define void @func_other_fi_user_non_inli
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]]
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]]
-; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9
+; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
; GCN: ds_write_b32 v0, [[VZ]]
define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
%alloca0 = alloca [128 x i32], align 4, addrspace(5)
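The new function-returns.ll test below exercises the sret side: the
AssertZext added in LowerFormalArguments says an sret pointer fits in
17 bits, so a right shift by 17 or more folds to the constant 0, while
a shift by 16 must still be emitted. An illustrative sketch (the
worst-case value is hypothetical):

#include <cstdio>

int main() {
  // Hypothetical worst-case sret pointer under the new bound:
  // all 17 permitted low bits set.
  unsigned SretAddr = (1u << 17) - 1;
  printf("%u\n", SretAddr >> 16); // may be nonzero -> real v_lshrrev kept
  printf("%u\n", SretAddr >> 17); // always 0 -> folded to a constant
  printf("%u\n", SretAddr >> 18); // always 0 -> folded to a constant
}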
Modified: llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll?rev=361541&r1=361540&r2=361541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll Thu May 23 12:38:14 2019
@@ -570,4 +570,24 @@ define { <3 x float>, i32 } @v3f32_struc
ret { <3 x float>, i32 } %insert.4
}
+; GCN-LABEL: {{^}}void_func_sret_max_known_zero_bits:
+; GCN: v_lshrrev_b32_e32 [[LSHR16:v[0-9]+]], 16, v0
+; GCN: ds_write_b32 {{v[0-9]+}}, [[LSHR16]]
+
+; GCN: v_mov_b32_e32 [[HIGH_BITS:v[0-9]+]], 0
+; GCN: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]]
+; GCN-NEXT: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]]
+define void @void_func_sret_max_known_zero_bits(i8 addrspace(5)* sret %arg0) #0 {
+ %arg0.int = ptrtoint i8 addrspace(5)* %arg0 to i32
+
+ %lshr0 = lshr i32 %arg0.int, 16
+ %lshr1 = lshr i32 %arg0.int, 17
+ %lshr2 = lshr i32 %arg0.int, 18
+
+ store volatile i32 %lshr0, i32 addrspace(3)* undef
+ store volatile i32 %lshr1, i32 addrspace(3)* undef
+ store volatile i32 %lshr2, i32 addrspace(3)* undef
+ ret void
+}
+
attributes #0 = { nounwind }
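The rewritten huge-private-buffer.ll below checks the same bound via
masking: an 'and' that keeps 17 or more low bits of a frame index is
provably redundant and is deleted, while a 16-bit mask survives. A
sketch under the same assumed 17-bit worst case:

#include <cassert>

int main() {
  unsigned FI = (1u << 17) - 1;   // hypothetical worst-case frame index
  assert((FI & 0x1FFFF) == FI);   // 17-bit mask: redundant, folded away
  assert((FI & 0x3FFFF) == FI);   // 18-bit mask: redundant, folded away
  assert((FI & 0xFFFF) != FI);    // 16-bit mask: may change value, kept
}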
Modified: llvm/trunk/test/CodeGen/AMDGPU/huge-private-buffer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/huge-private-buffer.ll?rev=361541&r1=361540&r2=361541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/huge-private-buffer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/huge-private-buffer.ll Thu May 23 12:38:14 2019
@@ -1,31 +1,42 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_small:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+ %toint = ptrtoint i32 addrspace(5)* %alloca to i32
+ %masked = and i32 %toint, 65535
+ store volatile i32 %masked, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
; GCN-NOT: [[FI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
-define amdgpu_kernel void @scratch_buffer_known_high_bit_small() #0 {
+define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
%toint = ptrtoint i32 addrspace(5)* %alloca to i32
- %masked = and i32 %toint, 2147483647
+ %masked = and i32 %toint, 131071
store volatile i32 %masked, i32 addrspace(1)* undef
ret void
}
-; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_huge:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN-DAG: buffer_store_dword
-; GCN-DAG: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x7ffffffc, [[FI]]
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_bit_huge() #1 {
+; GCN-NOT: [[FI]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
+define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
%toint = ptrtoint i32 addrspace(5)* %alloca to i32
- %masked = and i32 %toint, 2147483647
+ %masked = and i32 %toint, 262143
store volatile i32 %masked, i32 addrspace(1)* undef
ret void
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "target-features"="+huge-private-buffer" }