[llvm] [AMDGPU] Include unused preload kernarg in KD total SGPR count (PR #104743)

Mon Sep 23 10:06:00 PDT 2024

https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/104743

>From 2e85e2a5d0e9071081899addfba3b133ab44c758 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Mon, 19 Aug 2024 01:02:17 -0700
Subject: [PATCH] [AMDGPU] Include unused preload kernarg in KD total SGPR
 count

Unlike with implicitly preloaded data UserSGPRs, firmware is unable to
handle cases where SGPRs for kernel arguments contain preloaded data but
are not explicitly referenced in the kernel. We need to include
these preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT calculation
to not clobber SGPRs in adjacent waves.
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  8 ++
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 13 +++-
 .../amdhsa-kernarg-preload-num-sgprs.ll       | 73 +++++++++++++++++++
 .../MC/AMDGPU/amdhsa-kd-kernarg-preload.s     | 21 ++++++
 4 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
 create mode 100644 llvm/test/MC/AMDGPU/amdhsa-kd-kernarg-preload.s

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b90d245b7bd394..1d5a279068c12f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -900,6 +900,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+  } else if (isKernel(F.getCallingConv()) && MFI->getNumKernargPreloadedSGPRs()) {
+    // Consider cases where the total number of UserSGPRs with trailing
+    // allocated preload SGPRs, is greater than the number of explicitly
+    // referenced SGPRs.
+    const MCExpr *MaxUserSGPRs = MCBinaryExpr::createAdd(
+        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
+    ProgInfo.NumSGPR =
+        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, MaxUserSGPRs}, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index f21c091bceea29..555b8cb5c6e53d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5834,6 +5834,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
   if (!Seen.contains(".amdhsa_next_free_sgpr"))
     return TokError(".amdhsa_next_free_sgpr directive is required");
 
+  unsigned UserSGPRCount = ExplicitUserSGPRCount.value_or(ImpliedUserSGPRCount);
+
+  // Consider the case where the total number of UserSGPRs with trailing
+  // allocated preload SGPRs, is greater than the number of explicitly
+  // referenced SGPRs.
+  if (PreloadLength) {
+    MCContext &Ctx = getContext();
+    NextFreeSGPR = AMDGPUMCExpr::createMax(
+        {NextFreeSGPR, MCConstantExpr::create(UserSGPRCount, Ctx)}, Ctx);
+  }
+
   const MCExpr *VGPRBlocks;
   const MCExpr *SGPRBlocks;
   if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
@@ -5870,8 +5881,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
     return TokError("amdgpu_user_sgpr_count smaller than than implied by "
                     "enabled user SGPRs");
 
-  unsigned UserSGPRCount = ExplicitUserSGPRCount.value_or(ImpliedUserSGPRCount);
-
   if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
     return TokError("too many user SGPRs enabled");
   AMDGPU::MCKernelDescriptor::bits_set(
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
new file mode 100644
index 00000000000000..c8ba6722d9d85e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -0,0 +1,73 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s
+
+; OBJDUMP: Contents of section .rodata:
+; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000  ................
+; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NOT:  0030 0000af00 94130000 1a000400 00000000  ................
+; OBJDUMP-NEXT: 0030 4000af00 94130000 1a000400 00000000  @...............
+
+; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM: .amdhsa_user_sgpr_count 10
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; NumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16
+
+; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
+; field that are not explicitly referenced in the kernel. This test has 6 implicit
+; user SGPRs enabled, 4 preloaded kernarg SGPRs, plus 6 extra SGPRs allocated
+; for flat scratch, etc. The total number of allocated SGPRs encoded in the
+; kernel descriptor should be 16. That's a 1 in the KD field since the granule
+; size is 8 and it's NumGranules - 1. The encoding for that looks like '40'.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }
+
+; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
+; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
+
+; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM: .amdhsa_user_sgpr_count 10
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; NumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16
+
+; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
+; implicit, and 6 extra.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }
+
+; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000  @...............
+
+; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM: .amdhsa_user_sgpr_count 3
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; NumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9
+
+; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }
+
+; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
+
+; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM: .amdhsa_user_sgpr_count 2
+; ASM: .amdhsa_next_free_sgpr 0
+; ASM: ; NumSgprs: 6
+; ASM: ; NumSGPRsForWavesPerEU: 6
+
+; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
+; Encoded like '00'.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void }
+
+attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/MC/AMDGPU/amdhsa-kd-kernarg-preload.s b/llvm/test/MC/AMDGPU/amdhsa-kd-kernarg-preload.s
new file mode 100644
index 00000000000000..f4ae23fc0aa7b9
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/amdhsa-kd-kernarg-preload.s
@@ -0,0 +1,21 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s -o - | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx940"
+
+.rodata
+
+// Account for preload kernarg SGPRs in KD field GRANULATED_WAVEFRONT_SGPR_COUNT.
+
+// OBJDUMP:      Contents of section .rodata:
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000  ................
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+// OBJDUMP-NOT:  0030 0000ac00 92000000 00000900 00000000  ................
+// OBJDUMP-NEXT: 0030 4000ac00 92000000 00000900 00000000  @...............
+
+.amdhsa_kernel amdhsa_kd_kernarg
+  .amdhsa_user_sgpr_kernarg_preload_length 9
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel