[llvm] [NVPTX] use incomplete aggregate initializers (PR #79062)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 22 16:57:19 PST 2024
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/79062
>From 09c54e6ddf67e4e060716480f5fa17fef5246773 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Sat, 13 Jan 2024 00:50:34 +0000
Subject: [PATCH 1/2] [NVPTX] use incomplete aggregate initializers
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 14 ++++++++++++--
llvm/test/CodeGen/NVPTX/globals_init.ll | 2 +-
llvm/test/CodeGen/NVPTX/i128-global.ll | 2 +-
3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 6a03c7b0abc3444..bccb773b0c5f50d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1292,10 +1292,20 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
unsigned int ptrSize = AP.MAI->getCodePointerSize();
- symbolPosInBuffer.push_back(size);
+ // Last contingous sequnce of 0 bytes are unintialized to get default
+ // initialization in ptxas. This avoids ptxas memory consumption for large
+ // aggregates.
+ unsigned int count = size;
+ // TODO: symbols make this harder, but it would still be good to trim trailing
+ // 0s for aggs with symbols as well.
+ if (numSymbols() == 0)
+ while (count >= 1 && !buffer[count - 1])
+ count--;
+
+ symbolPosInBuffer.push_back(count);
unsigned int nSym = 0;
unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
- for (unsigned int pos = 0; pos < size;) {
+ for (unsigned int pos = 0; pos < count;) {
if (pos)
os << ", ";
if (pos != nextSymbolPos) {
diff --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll
index bdfba6b602aecb5..d607c6e822a9405 100644
--- a/llvm/test/CodeGen/NVPTX/globals_init.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_init.ll
@@ -26,6 +26,6 @@
@Gblf64 = global [2 x double] [double 5.75e-25, double 12.25e+56]
; Make sure we fill in alignment gaps correctly.
-; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0};
+; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1};
@GblU = global {i16, i32, i8} {i16 1543, i32 33752069, i8 1}
diff --git a/llvm/test/CodeGen/NVPTX/i128-global.ll b/llvm/test/CodeGen/NVPTX/i128-global.ll
index cdebc65272f8627..c87087484da1f9f 100644
--- a/llvm/test/CodeGen/NVPTX/i128-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-global.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+; CHECK: .visible .global .align 16 .b8 G1[16] = {1};
@G1 = global i128 1
; CHECK: .visible .global .align 16 .b8 G2[16];
>From d3c40194819ca56c5baf9fa851c0cfe0a1953f42 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Tue, 23 Jan 2024 00:56:39 +0000
Subject: [PATCH 2/2] address comments
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 16 +++++++-------
.../CodeGen/NVPTX/global-incomplete-init.ll | 22 +++++++++++++++++++
2 files changed, 30 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/global-incomplete-init.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index bccb773b0c5f50d..8c7530e934a2038 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1292,20 +1292,20 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
unsigned int ptrSize = AP.MAI->getCodePointerSize();
- // Last contingous sequnce of 0 bytes are unintialized to get default
- // initialization in ptxas. This avoids ptxas memory consumption for large
- // aggregates.
- unsigned int count = size;
+ // Do not emit trailing zero initializers. They will be zero-initialized by
+ // ptxas. This saves on both space requirements for the generated PTX and on
+ // memory use by ptxas.
+ unsigned int InitializerCount = size;
// TODO: symbols make this harder, but it would still be good to trim trailing
// 0s for aggs with symbols as well.
if (numSymbols() == 0)
- while (count >= 1 && !buffer[count - 1])
- count--;
+ while (InitializerCount >= 1 && !buffer[InitializerCount - 1])
+ InitializerCount--;
- symbolPosInBuffer.push_back(count);
+ symbolPosInBuffer.push_back(InitializerCount);
unsigned int nSym = 0;
unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
- for (unsigned int pos = 0; pos < count;) {
+ for (unsigned int pos = 0; pos < InitializerCount;) {
if (pos)
os << ", ";
if (pos != nextSymbolPos) {
diff --git a/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll
new file mode 100644
index 000000000000000..d9c08b8dad57fff
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
+
+; Make sure global constants have their trailing zeros properly trimmed.
+
+; basic case
+; CHECK-DAG: .b8 A[8] = {3, 4, 0, 0, 5};
+@A = global [8 x i8] [i8 3, i8 4, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0]
+
+; all-zeros
+; CHECK-DAG: .b8 B[2];
+@B = global [2 x i8] [i8 0, i8 0]
+
+; all-non-zeros
+; CHECK-DAG: .b8 C[4] = {1, 2, 3, 4};
+@C = global [4 x i8] [i8 1, i8 2, i8 3, i8 4]
+
+; initializer with a symbol, the last 0 could be default initialized
+; CHECK-DAG: .u8 e = 1;
+; CHECK-DAG: .u64 D[4] = {e, 0, e, 0};
+@e = addrspace(1) global i8 1
+@D = addrspace(1) global [4 x ptr addrspace(1)] [ptr addrspace(1) @e, ptr addrspace(1) null, ptr addrspace(1) @e, ptr addrspace(1) null]
More information about the llvm-commits
mailing list