[llvm] 3b8539c - [NVPTX] use incomplete aggregate initializers (#79062)

Wed Jan 24 09:24:32 PST 2024

Author: Alex MacLean
Date: 2024-01-24T09:24:28-08:00
New Revision: 3b8539c9dc0bc38ebea903e038257ed4328f290b

URL: https://github.com/llvm/llvm-project/commit/3b8539c9dc0bc38ebea903e038257ed4328f290b
DIFF: https://github.com/llvm/llvm-project/commit/3b8539c9dc0bc38ebea903e038257ed4328f290b.diff

LOG: [NVPTX] use incomplete aggregate initializers (#79062)

The PTX ISA specifies that initializers may be incomplete ([5.4.4.
Initializers](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers))
> As in C, array initializers may be incomplete, i.e., the number of
initializer elements may be less than the extent of the corresponding
array dimension, with remaining array locations initialized to the
default value for the specified array type.

Emitting initializers in this form is preferable because it reduces the
size of the PTX, in some cases significantly, and can improve compile
time of ptxas as a result.

Added: 
    llvm/test/CodeGen/NVPTX/global-incomplete-init.ll

Modified: 
    llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
    llvm/test/CodeGen/NVPTX/globals_init.ll
    llvm/test/CodeGen/NVPTX/i128-global.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 6a03c7b0abc3444..9534ad9b4f83d7a 100644

--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1292,10 +1292,21 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
 
 void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
   unsigned int ptrSize = AP.MAI->getCodePointerSize();
-  symbolPosInBuffer.push_back(size);
+  // Do not emit trailing zero initializers. They will be zero-initialized by
+  // ptxas. This saves on both space requirements for the generated PTX and on
+  // memory use by ptxas. (See:
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#global-state-space)
+  unsigned int InitializerCount = size;
+  // TODO: symbols make this harder, but it would still be good to trim trailing
+  // 0s for aggs with symbols as well.
+  if (numSymbols() == 0)
+    while (InitializerCount >= 1 && !buffer[InitializerCount - 1])
+      InitializerCount--;
+
+  symbolPosInBuffer.push_back(InitializerCount);
   unsigned int nSym = 0;
   unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
-  for (unsigned int pos = 0; pos < size;) {
+  for (unsigned int pos = 0; pos < InitializerCount;) {
     if (pos)
       os << ", ";
     if (pos != nextSymbolPos) {

diff  --git a/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll
new file mode 100644
index 000000000000000..d9c08b8dad57fff
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
+
+; Make sure the globals constants have trailing zeros properly trimmed
+
+; basic case
+; CHECK-DAG: .b8 A[8] = {3, 4, 0, 0, 5};
+@A = global [8 x i8] [i8 3, i8 4, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0]
+
+; all-zeros
+; CHECK-DAG: .b8 B[2];
+@B = global [2 x i8] [i8 0, i8 0]
+
+; all-non-zeros
+; CHECK-DAG: .b8 C[4] = {1, 2, 3, 4};
+@C = global [4 x i8] [i8 1, i8 2, i8 3, i8 4]
+
+; initializer with a symbol, the last 0 could be default initialized
+; CHECK-DAG: .u8 e = 1;
+; CHECK-DAG: .u64 D[4] = {e, 0, e, 0};
+@e = addrspace(1) global i8 1
+@D = addrspace(1) global [4 x ptr addrspace(1)] [ptr addrspace(1) @e, ptr addrspace(1) null, ptr addrspace(1) @e, ptr addrspace(1) null]

diff  --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll
index bdfba6b602aecb5..d607c6e822a9405 100644
--- a/llvm/test/CodeGen/NVPTX/globals_init.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_init.ll
@@ -26,6 +26,6 @@
 @Gblf64 = global [2 x double] [double 5.75e-25, double 12.25e+56]
 
 ; Make sure we fill in alignment gaps correctly.
-; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0};
+; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1};
 @GblU = global {i16, i32, i8} {i16 1543, i32 33752069, i8 1}
 

diff  --git a/llvm/test/CodeGen/NVPTX/i128-global.ll b/llvm/test/CodeGen/NVPTX/i128-global.ll
index cdebc65272f8627..c87087484da1f9f 100644
--- a/llvm/test/CodeGen/NVPTX/i128-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-global.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
-; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+; CHECK: .visible .global .align 16 .b8 G1[16] = {1};
 @G1 = global i128 1
 
 ; CHECK: .visible .global .align 16 .b8 G2[16];