[llvm] 42b9ea8 - [AMDGPU] Increase max scratch allocation for GFX12 (#77625)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 17 02:25:32 PST 2024
Author: Jay Foad
Date: 2024-01-17T10:25:28Z
New Revision: 42b9ea841e2d9fe186b8892be713443b5f680565
URL: https://github.com/llvm/llvm-project/commit/42b9ea841e2d9fe186b8892be713443b5f680565
DIFF: https://github.com/llvm/llvm-project/commit/42b9ea841e2d9fe186b8892be713443b5f680565.diff
LOG: [AMDGPU] Increase max scratch allocation for GFX12 (#77625)
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIDefines.h
llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d317a733d4331c..10f7e7a26edb4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -981,8 +981,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX11
- ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ STM.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+ : STM.getGeneration() == AMDGPUSubtarget::GFX11
+ ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
: S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
@@ -993,8 +995,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX11
- ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ STM.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+ : STM.getGeneration() == AMDGPUSubtarget::GFX11
+ ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
: S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 52b9b94fa61e1d..7c0ed6fe3b91a0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -297,12 +297,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned getMaxWaveScratchSize() const {
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
- if (getGeneration() < GFX11) {
- // 13-bit field in units of 256-dword.
- return (256 * 4) * ((1 << 13) - 1);
+ if (getGeneration() >= GFX12) {
+ // 18-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 18) - 1);
}
- // 15-bit field in units of 64-dword.
- return (64 * 4) * ((1 << 15) - 1);
+ if (getGeneration() == GFX11) {
+ // 15-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 15) - 1);
+ }
+ // 13-bit field in units of 256-dword.
+ return (256 * 4) * ((1 << 13) - 1);
}
/// Return the number of high bits known to be zero for a frame index.
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index d6492a5f405184..8ab66d4fd5b861 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1176,11 +1176,13 @@ enum Type { TRAP = -2, WORKGROUP = -1 };
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
-#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
+#define S_00B860_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
+#define S_00B860_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
-#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 94295dc3af3136..5882043caa0bd1 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,62 +1,81 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 16383
+ %masked = and i32 %toint, 65535
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 65535
+ %masked = and i32 %toint, 131071
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; WAVE64-NOT: [[FI]]
-; WAVE64: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
-
-; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]]
-; WAVE32: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K-NOT: v_and_b32
+; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 131071
+ %masked = and i32 %toint, 262143
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN-NOT: [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
-define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K-NOT: v_and_b32
+; SCRATCH1024K-NOT: v_and_b32
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0xffffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 262143
+ %masked = and i32 %toint, 1048575
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-attributes #0 = { nounwind }
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN-NOT: v_and_b32
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 0, ptr addrspace(5) %alloca
+ %toint = ptrtoint ptr addrspace(5) %alloca to i32
+ %masked = and i32 %toint, 2097151
+ store volatile i32 %masked, ptr addrspace(1) undef
+ ret void
+}
More information about the llvm-commits mailing list