[llvm] [AMDGPU] Add pal metadata 3.0 support to callable pal funcs (PR #67104)
David Stuttard via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 22 02:22:18 PDT 2023
https://github.com/dstutt created https://github.com/llvm/llvm-project/pull/67104
None
From 259138920126f09149b488fc54e8d2a7da969ca4 Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard at amd.com>
Date: Thu, 24 Aug 2023 16:45:50 +0100
Subject: [PATCH] [AMDGPU] Add pal metadata 3.0 support to callable pal funcs
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 28 +-
.../AMDGPU/pal-metadata-3.0-callable.ll | 290 ++++++++++++++++++
2 files changed, 314 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b2360ce30fd6edb..22ecd3656d00a28 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1098,10 +1098,30 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
   StringRef FnName = MF.getFunction().getName();
   MD->setFunctionScratchSize(FnName, MFI.getStackSize());
 
-  // Set compute registers
-  MD->setRsrc1(CallingConv::AMDGPU_CS,
-               CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
-  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
+  if (MD->getPALMajorVersion() < 3) {
+    // Set compute registers
+    MD->setRsrc1(CallingConv::AMDGPU_CS,
+                 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
+    MD->setRsrc2(CallingConv::AMDGPU_CS,
+                 CurrentProgramInfo.getComputePGMRSrc2());
+  } else {
+    MD->setHwStage(CallingConv::AMDGPU_CS, ".ieee_mode",
+                   (bool)CurrentProgramInfo.IEEEMode);
+    MD->setHwStage(CallingConv::AMDGPU_CS, ".wgp_mode",
+                   (bool)CurrentProgramInfo.WgpMode);
+    MD->setHwStage(CallingConv::AMDGPU_CS, ".mem_ordered",
+                   (bool)CurrentProgramInfo.MemOrdered);
+
+    MD->setHwStage(CallingConv::AMDGPU_CS, ".trap_present",
+                   (bool)CurrentProgramInfo.TrapHandlerEnable);
+    MD->setHwStage(CallingConv::AMDGPU_CS, ".excp_en",
+                   CurrentProgramInfo.EXCPEnable);
+
+    const unsigned LdsDwGranularity = 128;
+    MD->setHwStage(CallingConv::AMDGPU_CS, ".lds_size",
+                   (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
+                              sizeof(uint32_t)));
+  }
 
   // Set optional info
   MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
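
A note on the ".lds_size" value written in the PAL 3.0 path above: CurrentProgramInfo.LdsSize is expressed in blocks of LdsDwGranularity (128) dwords, so the hardware-stage field reports bytes as blocks * 128 * sizeof(uint32_t). A minimal standalone sketch of that conversion (ldsSizeInBytes is a hypothetical helper name used only for illustration, not part of the patch):

    #include <cstdint>

    // Convert the granularity-encoded LDS allocation (blocks of 128 dwords,
    // as in CurrentProgramInfo.LdsSize above) into the byte count reported
    // under the ".lds_size" hardware-stage key.
    static unsigned ldsSizeInBytes(unsigned LdsBlocks) {
      const unsigned LdsDwGranularity = 128; // dwords per block
      return LdsBlocks * LdsDwGranularity * sizeof(uint32_t);
    }

For example, a single block gives 1 * 128 * 4 = 0x200 bytes, the same figure that appears as ".lds_size: 0x200" under the .cs hardware stage in the new test below.
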
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
new file mode 100644
index 000000000000000..d4a5f61aced61a5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -0,0 +1,290 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT: - .api: Vulkan
+; CHECK-NEXT: .compute_registers:
+; CHECK-NEXT: .tg_size_en: true
+; CHECK-NEXT: .tgid_x_en: false
+; CHECK-NEXT: .tgid_y_en: false
+; CHECK-NEXT: .tgid_z_en: false
+; CHECK-NEXT: .tidig_comp_cnt: 0x1
+; CHECK-NEXT: .hardware_stages:
+; CHECK-NEXT: .cs:
+; CHECK-NEXT: .checksum_value: 0x9444d7d0
+; CHECK-NEXT: .debug_mode: 0
+; CHECK-NEXT: .excp_en: 0
+; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .ieee_mode: true
+; CHECK-NEXT: .image_op: false
+; CHECK-NEXT: .lds_size: 0x200
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .sgpr_limit: 0x6a
+; CHECK-NEXT: .threadgroup_dimensions:
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: - 0x400
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: .trap_present: false
+; CHECK-NEXT: .user_data_reg_map:
+; CHECK-NEXT: - 0x10000000
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: .user_sgprs: 0x3
+; CHECK-NEXT: .vgpr_limit: 0x100
+; CHECK-NEXT: .wavefront_size: 0x40
+; CHECK-NEXT: .wgp_mode: true
+; CHECK: .registers: {}
+; CHECK-NEXT: .shader_functions:
+; CHECK-NEXT: dynamic_stack:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: dynamic_stack_loop:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: multiple_stack:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x21
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: no_stack:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x20
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: no_stack_call:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: no_stack_extern_call:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: no_stack_extern_call_many_args:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: no_stack_indirect_call:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_lds:
+; CHECK-NEXT: .lds_size: 0x100
+; CHECK-NEXT: .sgpr_count: 0x20
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: simple_lds_recurse:
+; CHECK-NEXT: .lds_size: 0x100
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x29
+; CHECK-NEXT: simple_stack:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x21
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: simple_stack_call:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x4
+; CHECK-NEXT: simple_stack_extern_call:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_stack_indirect_call:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_stack_recurse:
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x2a
+; CHECK: amdpal.version:
+; CHECK-NEXT: - 0x3
+; CHECK-NEXT: - 0
+; CHECK-NEXT: ...
+; CHECK-NEXT: .end_amdgpu_pal_metadata
+
+declare amdgpu_gfx float @extern_func(float) #0
+declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
+
+@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define amdgpu_gfx float @no_stack(float %arg0) #0 {
+ %add = fadd float %arg0, 1.0
+ ret float %add
+}
+
+define amdgpu_gfx float @simple_stack(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ %stack2 = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack2
+ %val2 = load volatile float, ptr addrspace(5) %stack2
+ %add2 = fadd float %add, %val2
+ ret float %add2
+}
+
+define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
+bb0:
+ %cmp = fcmp ogt float %arg0, 0.0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ br label %bb2
+
+bb2:
+ %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
+ ret float %res
+}
+
+define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
+bb0:
+ br label %bb1
+
+bb1:
+ %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ %cmp = icmp sgt i32 %ctr, 0
+ %newctr = sub i32 %ctr, 1
+ br i1 %cmp, label %bb1, label %bb2
+
+bb2:
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
+ %res = call amdgpu_gfx float @simple_stack(float %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @simple_stack(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
+ %res = call amdgpu_gfx float @extern_func(float %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @extern_func(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
+ %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
+ %fptr = load ptr, ptr addrspace(4) @funcptr
+ call amdgpu_gfx void %fptr()
+ ret float %arg0
+}
+
+define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %fptr = load ptr, ptr addrspace(4) @funcptr
+ call amdgpu_gfx void %fptr()
+ %add = fadd float %arg0, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+@lds = internal addrspace(3) global [64 x float] undef
+
+define amdgpu_gfx float @simple_lds(float %arg0) #0 {
+ %val = load float, ptr addrspace(3) @lds
+ ret float %val
+}
+
+define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
+ %val = load float, ptr addrspace(3) @lds
+ %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
+ ret float %res
+}
+
+attributes #0 = { nounwind }
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}