[llvm] [NVPTX] use incomplete aggregate initializers (PR #79062)

Mon Jan 22 16:57:19 PST 2024

https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/79062

>From 09c54e6ddf67e4e060716480f5fa17fef5246773 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Sat, 13 Jan 2024 00:50:34 +0000
Subject: [PATCH 1/2] [NVPTX] use incomplete aggregate initializers

---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 14 ++++++++++++--
 llvm/test/CodeGen/NVPTX/globals_init.ll   |  2 +-
 llvm/test/CodeGen/NVPTX/i128-global.ll    |  2 +-
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 6a03c7b0abc3444..bccb773b0c5f50d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1292,10 +1292,20 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
 
 void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
   unsigned int ptrSize = AP.MAI->getCodePointerSize();
-  symbolPosInBuffer.push_back(size);
+  // Last contingous sequnce of 0 bytes are unintialized to get default
+  // initialization in ptxas. This avoids ptxas memory consumption for large
+  // aggregates.
+  unsigned int count = size;
+  // TODO: symbols make this harder, but it would still be good to trim trailing
+  // 0s for aggs with symbols as well.
+  if (numSymbols() == 0)
+    while (count >= 1 && !buffer[count - 1])
+      count--;
+
+  symbolPosInBuffer.push_back(count);
   unsigned int nSym = 0;
   unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
-  for (unsigned int pos = 0; pos < size;) {
+  for (unsigned int pos = 0; pos < count;) {
     if (pos)
       os << ", ";
     if (pos != nextSymbolPos) {
diff --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll
index bdfba6b602aecb5..d607c6e822a9405 100644
--- a/llvm/test/CodeGen/NVPTX/globals_init.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_init.ll
@@ -26,6 +26,6 @@
 @Gblf64 = global [2 x double] [double 5.75e-25, double 12.25e+56]
 
 ; Make sure we fill in alignment gaps correctly.
-; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0};
+; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1};
 @GblU = global {i16, i32, i8} {i16 1543, i32 33752069, i8 1}
 
diff --git a/llvm/test/CodeGen/NVPTX/i128-global.ll b/llvm/test/CodeGen/NVPTX/i128-global.ll
index cdebc65272f8627..c87087484da1f9f 100644
--- a/llvm/test/CodeGen/NVPTX/i128-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-global.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
-; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+; CHECK: .visible .global .align 16 .b8 G1[16] = {1};
 @G1 = global i128 1
 
 ; CHECK: .visible .global .align 16 .b8 G2[16];

>From d3c40194819ca56c5baf9fa851c0cfe0a1953f42 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Tue, 23 Jan 2024 00:56:39 +0000
Subject: [PATCH 2/2] address comments

---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     | 16 +++++++-------
 .../CodeGen/NVPTX/global-incomplete-init.ll   | 22 +++++++++++++++++++
 2 files changed, 30 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/global-incomplete-init.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index bccb773b0c5f50d..8c7530e934a2038 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1292,20 +1292,20 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
 
 void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
   unsigned int ptrSize = AP.MAI->getCodePointerSize();
-  // Last contingous sequnce of 0 bytes are unintialized to get default
-  // initialization in ptxas. This avoids ptxas memory consumption for large
-  // aggregates.
-  unsigned int count = size;
+  // Do not emit trailing zero initializers. They will be zero-initialized by
+  // ptxas. This saves on both space requirements for the generated PTX and on
+  // memory use by ptxas.
+  unsigned int InitializerCount = size;
   // TODO: symbols make this harder, but it would still be good to trim trailing
   // 0s for aggs with symbols as well.
   if (numSymbols() == 0)
-    while (count >= 1 && !buffer[count - 1])
-      count--;
+    while (InitializerCount >= 1 && !buffer[InitializerCount - 1])
+      InitializerCount--;
 
-  symbolPosInBuffer.push_back(count);
+  symbolPosInBuffer.push_back(InitializerCount);
   unsigned int nSym = 0;
   unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
-  for (unsigned int pos = 0; pos < count;) {
+  for (unsigned int pos = 0; pos < InitializerCount;) {
     if (pos)
       os << ", ";
     if (pos != nextSymbolPos) {
diff --git a/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll
new file mode 100644
index 000000000000000..d9c08b8dad57fff
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
+
+; Make sure that global constants have their trailing zeros properly trimmed.
+
+; basic case
+; CHECK-DAG: .b8 A[8] = {3, 4, 0, 0, 5};
+@A = global [8 x i8] [i8 3, i8 4, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0]
+
+; all-zeros
+; CHECK-DAG: .b8 B[2];
+@B = global [2 x i8] [i8 0, i8 0]
+
+; all-non-zeros
+; CHECK-DAG: .b8 C[4] = {1, 2, 3, 4};
+@C = global [4 x i8] [i8 1, i8 2, i8 3, i8 4]
+
+; initializer with a symbol: the trailing 0 could be default-initialized, but it is still emitted for now
+; CHECK-DAG: .u8 e = 1;
+; CHECK-DAG: .u64 D[4] = {e, 0, e, 0};
+@e = addrspace(1) global i8 1
+@D = addrspace(1) global [4 x ptr addrspace(1)] [ptr addrspace(1) @e, ptr addrspace(1) null, ptr addrspace(1) @e, ptr addrspace(1) null]