[llvm] [NVPTX] Merge consecutive elements while buffering constant vectors with sub-byte datatype. (PR #183628)
Karthik Senthil via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 3 15:27:55 PDT 2026
https://github.com/karthik-senthil updated https://github.com/llvm/llvm-project/pull/183628
>From 2d2b55740a2047c77f0365b6001ae4f7993611af Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Wed, 25 Feb 2026 22:59:07 +0000
Subject: [PATCH 1/7] [NVPTX] Merge consecutive elements while buffering
constant vectors with sub-byte datatype.
NVPTXAsmPrinter currently buffers one element at a time while generating PTX
for constant global vectors. This triggers an assertion failure for vectors of
sub-byte datatypes (like i4), because the buffer is allocated with fewer bytes
than the vector has elements. This PR fixes the printer to merge consecutive
elements into a full byte before buffering.
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 52 ++++++++++++++++++-
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 3 ++
.../NVPTX/sub-byte-constant-vectors.ll | 13 +++++
3 files changed, 66 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index b18d6239f0026..5a9e248a484cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1707,13 +1707,19 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
}
}
- // Old constants
- if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+ // Buffer arrays one element at a time.
+ if (isa<ConstantArray>(CPV)) {
for (const auto &Op : CPV->operands())
bufferLEByte(cast<Constant>(Op), 0, aggBuffer);
return;
}
+ // Constant vectors
+ if (const auto *CVec = dyn_cast<ConstantVector>(CPV)) {
+ bufferAggregateConstVec(CVec, aggBuffer);
+ return;
+ }
+
if (const auto *CDS = dyn_cast<ConstantDataSequential>(CPV)) {
for (unsigned I : llvm::seq(CDS->getNumElements()))
bufferLEByte(cast<Constant>(CDS->getElementAsConstant(I)), 0, aggBuffer);
@@ -1737,6 +1743,48 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
llvm_unreachable("unsupported constant type in printAggregateConstant()");
}
+void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
+ AggBuffer *aggBuffer) {
+ unsigned NumElems = CV->getType()->getNumElements();
+ const unsigned BuffSize = aggBuffer->getSize();
+
+ // Buffer one element at a time if we have allocated enough buffer space.
+ if (BuffSize >= NumElems) {
+ for (const auto &Op : CV->operands())
+ bufferLEByte(cast<Constant>(Op), 0, aggBuffer);
+ return;
+ }
+
+ // We have more elements than allocated buffer space, this implies sub-byte
+ // datatype in the vector. Merge consecutive elements to form a full byte.
+ Type *ElemTy = CV->getType()->getElementType();
+ assert(ElemTy->isIntegerTy() && "Expected integer data type.");
+ unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
+ assert(ElemTySize < 8 && "Expected sub-byte data type.");
+ // Number of elements to merge to form a full byte.
+ unsigned ChunkSize = 8 / ElemTySize;
+
+ // Iterate through elements of vector one chunk at a time and buffer that
+ // chunk.
+ for (unsigned I = 0; I < NumElems; I += ChunkSize) {
+ unsigned CurrVal = 0;
+ for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J) {
+ auto *Elem = cast<ConstantInt>(CV->getAggregateElement(J));
+ unsigned ElemVal = Elem->getZExtValue();
+ // PTX datalayout is little-endian, so shift element based on its position
+ // in the chunk.
+ unsigned ShiftAmount = (J - I) * ElemTySize;
+ CurrVal |= ElemVal << ShiftAmount;
+ }
+
+ // Create a new constant to represent the merged value of the chunk and
+ // buffer it.
+ auto *MergedElem =
+ ConstantInt::get(Type::getInt8Ty(ElemTy->getContext()), CurrVal);
+ bufferLEByte(MergedElem, 0, aggBuffer);
+ }
+}
+
/// lowerConstantForGV - Return an MCExpr for the given Constant. This is mostly
/// a copy from AsmPrinter::lowerConstant, except customized to only handle
/// expressions that are representable in PTX and create
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index ebdfac93c04f4..adede1689cc01 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -109,6 +109,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
: size(size), buffer(size), curpos(0), AP(AP),
EmitGeneric(AP.EmitGeneric) {}
+ unsigned getSize() const { return size; }
+
// Copy Num bytes from Ptr.
// if Bytes > Num, zero fill up to Bytes.
void addBytes(const unsigned char *Ptr, unsigned Num, unsigned Bytes) {
@@ -220,6 +222,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
void printFPConstant(const ConstantFP *Fp, raw_ostream &O) const;
void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
+ void bufferAggregateConstVec(const ConstantVector *CV, AggBuffer *aggBuffer);
void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
void emitDeclarations(const Module &, raw_ostream &O);
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
new file mode 100644
index 0000000000000..9ab9d05d121bf
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
+
+target triple = "nvptx-nvidia-cuda"
+
+; CHECK: .visible .global .align 1 .b8 test0[1] = {33};
+@test0 = local_unnamed_addr addrspace(1) constant <2 x i4> <i4 1, i4 2>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test1[2] = {33, 3};
+@test1 = local_unnamed_addr addrspace(1) constant <3 x i4> <i4 1, i4 2, i4 3>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test2[1] = {228};
+@test2 = local_unnamed_addr addrspace(1) constant <4 x i2> <i2 0, i2 1, i2 2, i2 3>, align 1
>From 469541a712a56fcabdce9405c96135345f11e8ca Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Thu, 26 Feb 2026 23:06:38 +0000
Subject: [PATCH 2/7] Use ConstantFolding instead of manually shifting bits.
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 36 +++++++++++++++--------
1 file changed, 23 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 5a9e248a484cc..ba63fdafe6d1d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1761,26 +1761,36 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
assert(ElemTy->isIntegerTy() && "Expected integer data type.");
unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
assert(ElemTySize < 8 && "Expected sub-byte data type.");
+ assert(8 % ElemTySize == 0 && "Element type size must evenly divide a byte.");
// Number of elements to merge to form a full byte.
unsigned ChunkSize = 8 / ElemTySize;
// Iterate through elements of vector one chunk at a time and buffer that
// chunk.
for (unsigned I = 0; I < NumElems; I += ChunkSize) {
- unsigned CurrVal = 0;
- for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J) {
- auto *Elem = cast<ConstantInt>(CV->getAggregateElement(J));
- unsigned ElemVal = Elem->getZExtValue();
- // PTX datalayout is little-endian, so shift element based on its position
- // in the chunk.
- unsigned ShiftAmount = (J - I) * ElemTySize;
- CurrVal |= ElemVal << ShiftAmount;
- }
+ // Collect elements in chunk to create sub-vector.
+ SmallVector<Constant *, 8> SubCVElems;
+ for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J)
+ SubCVElems.push_back(CV->getAggregateElement(J));
+
+ // For unevenly sized vectors add padding zeros.
+ unsigned PaddingZeroCount = ChunkSize - SubCVElems.size();
+ for (unsigned I = 0; I < PaddingZeroCount; ++I)
+ SubCVElems.push_back(ConstantInt::getNullValue(ElemTy));
+
+ auto SubCV = ConstantVector::get(SubCVElems);
+ Type *Int8Ty = IntegerType::get(CV->getContext(), 8);
+
+ // Merge elements of the chunk using ConstantFolding and buffer it.
+ ConstantInt *MergedElem =
+ dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
+ ConstantExpr::getBitCast(const_cast<Constant *>(SubCV), Int8Ty),
+ getDataLayout()));
+
+ if (!MergedElem)
+ report_fatal_error(
+ "Cannot lower vector global with unusual element type");
- // Create a new constant to represent the merged value of the chunk and
- // buffer it.
- auto *MergedElem =
- ConstantInt::get(Type::getInt8Ty(ElemTy->getContext()), CurrVal);
bufferLEByte(MergedElem, 0, aggBuffer);
}
}
>From 33803f749762519ee92b44741524ee7d840d5933 Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Tue, 3 Mar 2026 18:29:43 +0000
Subject: [PATCH 3/7] Address review comments.
- Rename test, add new test to check conversion of 2xi4 vector.
- Add comment to clarify current use-case of changes.
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 4 ++-
.../NVPTX/sub-byte-constant-vector-convert.ll | 32 +++++++++++++++++++
....ll => sub-byte-constant-vectors-i4-i2.ll} | 3 ++
3 files changed, 38 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll
rename llvm/test/CodeGen/NVPTX/{sub-byte-constant-vectors.ll => sub-byte-constant-vectors-i4-i2.ll} (83%)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index ba63fdafe6d1d..528b7f7803e4f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1756,7 +1756,9 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
}
// We have more elements than allocated buffer space, this implies sub-byte
- // datatype in the vector. Merge consecutive elements to form a full byte.
+ // datatype in the vector. Merge consecutive elements to form a full byte. We
+ // expect that 8 % sub-byte-elem-size should be 0 and current expected usage
+ // is for i4 (for e2m1-fp4 types).
Type *ElemTy = CV->getType()->getElementType();
assert(ElemTy->isIntegerTy() && "Expected integer data type.");
unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll
new file mode 100644
index 0000000000000..90bf9afc8da0a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; Test to verify that NVPTX backend correctly handles conversion of constant
+; global vectors containing sub-byte sized elements.
+
+; RUN: llc < %s -O0 -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -O0 -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx-nvidia-cuda"
+
+; CHECK: .visible .global .align 1 .b8 test0[1] = {33};
+@test0 = local_unnamed_addr addrspace(1) constant <2 x i4> <i4 1, i4 2>, align 1
+
+define <2 x half> @foo() {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.global.b8 %rs1, [test0];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b8 %e2m1x2_in;
+; CHECK-NEXT: cvt.u8.u16 %e2m1x2_in, %rs1;
+; CHECK-NEXT: cvt.rn.f16x2.e2m1x2 %r1, %e2m1x2_in;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %ld = load i8, ptr addrspace(1) @test0, align 1
+ %in = zext nneg i8 %ld to i16
+ %val = call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 %in)
+ ret <2 x half> %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
similarity index 83%
rename from llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
rename to llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
index 9ab9d05d121bf..27e7a0f220f69 100644
--- a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
@@ -1,3 +1,6 @@
+; Test to verify that NVPTX backend correctly handles constant global vectors
+; containing sub-byte sized elements.
+
; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
>From 3fcf18fb8dc56e6059074330c13c9dce8340d5fc Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Thu, 2 Apr 2026 23:02:38 +0000
Subject: [PATCH 4/7] Address review comments.
- Update comments.
- Separate chunk processing from tail padding logic.
- Rename getSize to getBufferSize.
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 72 ++++++++++++-------
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 10 +--
.../NVPTX/sub-byte-constant-vectors-i4-i2.ll | 6 ++
3 files changed, 59 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 528b7f7803e4f..392cc050ce64d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1115,7 +1115,7 @@ void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
// ptxas. This saves on both space requirements for the generated PTX and on
// memory use by ptxas. (See:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#global-state-space)
- unsigned int InitializerCount = size;
+ unsigned int InitializerCount = Size;
// TODO: symbols make this harder, but it would still be good to trim trailing
// 0s for aggs with symbols as well.
if (numSymbols() == 0)
@@ -1153,11 +1153,11 @@ void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
void NVPTXAsmPrinter::AggBuffer::printWords(raw_ostream &os) {
unsigned int ptrSize = AP.MAI->getCodePointerSize();
- symbolPosInBuffer.push_back(size);
+ symbolPosInBuffer.push_back(Size);
unsigned int nSym = 0;
unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
assert(nextSymbolPos % ptrSize == 0);
- for (unsigned int pos = 0; pos < size; pos += ptrSize) {
+ for (unsigned int pos = 0; pos < Size; pos += ptrSize) {
if (pos)
os << ", ";
if (pos == nextSymbolPos) {
@@ -1746,7 +1746,7 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
AggBuffer *aggBuffer) {
unsigned NumElems = CV->getType()->getNumElements();
- const unsigned BuffSize = aggBuffer->getSize();
+ const unsigned BuffSize = aggBuffer->getBufferSize();
// Buffer one element at a time if we have allocated enough buffer space.
if (BuffSize >= NumElems) {
@@ -1755,10 +1755,10 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
return;
}
- // We have more elements than allocated buffer space, this implies sub-byte
- // datatype in the vector. Merge consecutive elements to form a full byte. We
- // expect that 8 % sub-byte-elem-size should be 0 and current expected usage
- // is for i4 (for e2m1-fp4 types).
+ // Sub-byte datatypes will have more elements than bytes allocated for the
+ // buffer. Merge consecutive elements to form a full byte. We expect that 8 %
+ // sub-byte-elem-size should be 0 and current expected usage is for i4 (for
+ // e2m1-fp4 types).
Type *ElemTy = CV->getType()->getElementType();
assert(ElemTy->isIntegerTy() && "Expected integer data type.");
unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
@@ -1767,23 +1767,12 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
// Number of elements to merge to form a full byte.
unsigned ChunkSize = 8 / ElemTySize;
- // Iterate through elements of vector one chunk at a time and buffer that
- // chunk.
- for (unsigned I = 0; I < NumElems; I += ChunkSize) {
- // Collect elements in chunk to create sub-vector.
- SmallVector<Constant *, 8> SubCVElems;
- for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J)
- SubCVElems.push_back(CV->getAggregateElement(J));
-
- // For unevenly sized vectors add padding zeros.
- unsigned PaddingZeroCount = ChunkSize - SubCVElems.size();
- for (unsigned I = 0; I < PaddingZeroCount; ++I)
- SubCVElems.push_back(ConstantInt::getNullValue(ElemTy));
-
+ // Helper lambda to constant-fold array of sub-byte constants to i8.
+ auto ConvertSubCVtoInt8 = [this](ArrayRef<Constant *> SubCVElems) {
auto SubCV = ConstantVector::get(SubCVElems);
- Type *Int8Ty = IntegerType::get(CV->getContext(), 8);
+ Type *Int8Ty = IntegerType::get(SubCV->getContext(), 8);
- // Merge elements of the chunk using ConstantFolding and buffer it.
+ // Merge elements of the chunk using ConstantFolding.
ConstantInt *MergedElem =
dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
ConstantExpr::getBitCast(const_cast<Constant *>(SubCV), Int8Ty),
@@ -1793,7 +1782,42 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
report_fatal_error(
"Cannot lower vector global with unusual element type");
- bufferLEByte(MergedElem, 0, aggBuffer);
+ return MergedElem;
+ };
+
+ // Iterate through elements of vector one chunk at a time and buffer that
+ // chunk.
+ unsigned TailPaddedChunkBegin = 0;
+ bool NeedTailPadding = false;
+ for (unsigned I = 0; I < NumElems; I += ChunkSize) {
+ // If we have less elements than chunk size, then break to pad the final
+ // chunk with zeroes.
+ if (NumElems - I < ChunkSize) {
+ TailPaddedChunkBegin = I;
+ NeedTailPadding = true;
+ break;
+ }
+
+ // Collect elements in chunk to create sub-vector.
+ SmallVector<Constant *, 8> SubCVElems;
+ for (unsigned J = I; J < ChunkSize; ++J)
+ SubCVElems.push_back(CV->getAggregateElement(J));
+
+ // Buffer merged element.
+ bufferLEByte(ConvertSubCVtoInt8(SubCVElems), 0, aggBuffer);
+ }
+
+ // For unevenly sized vectors add tail padding zeros.
+ if (NeedTailPadding) {
+ SmallVector<Constant *, 8> TailPaddedElems;
+ for (unsigned I = TailPaddedChunkBegin; I < NumElems; ++I)
+ TailPaddedElems.push_back(CV->getAggregateElement(I));
+
+ unsigned NumPaddingZeros = ChunkSize - (NumElems - TailPaddedChunkBegin);
+ TailPaddedElems.append(
+ SmallVector(NumPaddingZeros, ConstantInt::getNullValue(ElemTy)));
+
+ bufferLEByte(ConvertSubCVtoInt8(TailPaddedElems), 0, aggBuffer);
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index adede1689cc01..f95c4c3299cb8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -88,7 +88,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
}
private:
- const unsigned size; // size of the buffer in bytes
+ const unsigned Size; // size of the buffer in bytes
std::vector<unsigned char> buffer; // the buffer
SmallVector<unsigned, 4> symbolPosInBuffer;
SmallVector<const Value *, 4> Symbols;
@@ -105,11 +105,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
const bool EmitGeneric;
public:
- AggBuffer(unsigned size, const NVPTXAsmPrinter &AP)
- : size(size), buffer(size), curpos(0), AP(AP),
+ AggBuffer(unsigned Size, const NVPTXAsmPrinter &AP)
+ : Size(Size), buffer(Size), curpos(0), AP(AP),
EmitGeneric(AP.EmitGeneric) {}
- unsigned getSize() const { return size; }
+ unsigned getBufferSize() const { return Size; }
// Copy Num bytes from Ptr.
// if Bytes > Num, zero fill up to Bytes.
@@ -121,7 +121,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
}
void addByte(uint8_t Byte) {
- assert(curpos < size);
+ assert(curpos < Size);
buffer[curpos] = Byte;
curpos++;
}
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
index 27e7a0f220f69..9b59bdbd438d8 100644
--- a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
@@ -14,3 +14,9 @@ target triple = "nvptx-nvidia-cuda"
; CHECK: .visible .global .align 1 .b8 test2[1] = {228};
@test2 = local_unnamed_addr addrspace(1) constant <4 x i2> <i2 0, i2 1, i2 2, i2 3>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test3[2] = {228, 4};
+@test3 = local_unnamed_addr addrspace(1) constant <6 x i2> <i2 0, i2 1, i2 2, i2 3, i2 0, i2 1>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test4[1] = {1};
+@test4 = local_unnamed_addr addrspace(1) constant <1 x i4> <i4 1>, align 1
>From 6677b6cbd1d7252d2aca06a0dc2551986fb16eae Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Fri, 3 Apr 2026 00:12:06 +0000
Subject: [PATCH 5/7] Address review comments.
- Simplify logic to process chunks and tail padding.
- Fold element retrieval and tail padding zeros into helper lambda.
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 62 +++++++++++------------
1 file changed, 30 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 392cc050ce64d..25f95a2fbdcf1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1766,13 +1766,31 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
assert(8 % ElemTySize == 0 && "Element type size must evenly divide a byte.");
// Number of elements to merge to form a full byte.
unsigned ChunkSize = 8 / ElemTySize;
+ unsigned NumChunks = NumElems / ChunkSize;
+ unsigned NumTailElems = NumElems % ChunkSize;
+
+ // Helper lambda to constant-fold sub-vector of sub-byte type elements into
+ // i8. Start and end indices of the sub-vector is provided, along with number
+ // of padding zeros if required.
+ auto ConvertSubCVtoInt8 = [this, &ElemTy](const ConstantVector *CV,
+ unsigned Start, unsigned End,
+ unsigned NumPaddingZeros) {
+ // Collect elements to create sub-vector.
+ SmallVector<Constant *, 8> SubCVElems;
+ for (unsigned I = Start; I < End; ++I)
+ SubCVElems.push_back(CV->getAggregateElement(I));
+
+ // Optionally pad with zeros.
+ if (NumPaddingZeros) {
+ SmallVector<Constant *, 8> Zeros(NumPaddingZeros,
+ ConstantInt::getNullValue(ElemTy));
+ SubCVElems.append(Zeros);
+ }
- // Helper lambda to constant-fold array of sub-byte constants to i8.
- auto ConvertSubCVtoInt8 = [this](ArrayRef<Constant *> SubCVElems) {
auto SubCV = ConstantVector::get(SubCVElems);
Type *Int8Ty = IntegerType::get(SubCV->getContext(), 8);
- // Merge elements of the chunk using ConstantFolding.
+ // Merge elements of the sub-vector using ConstantFolding.
ConstantInt *MergedElem =
dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
ConstantExpr::getBitCast(const_cast<Constant *>(SubCV), Int8Ty),
@@ -1787,37 +1805,17 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
// Iterate through elements of vector one chunk at a time and buffer that
// chunk.
- unsigned TailPaddedChunkBegin = 0;
- bool NeedTailPadding = false;
- for (unsigned I = 0; I < NumElems; I += ChunkSize) {
- // If we have less elements than chunk size, then break to pad the final
- // chunk with zeroes.
- if (NumElems - I < ChunkSize) {
- TailPaddedChunkBegin = I;
- NeedTailPadding = true;
- break;
- }
-
- // Collect elements in chunk to create sub-vector.
- SmallVector<Constant *, 8> SubCVElems;
- for (unsigned J = I; J < ChunkSize; ++J)
- SubCVElems.push_back(CV->getAggregateElement(J));
-
- // Buffer merged element.
- bufferLEByte(ConvertSubCVtoInt8(SubCVElems), 0, aggBuffer);
- }
+ for (unsigned I = 0; I < NumChunks; ++I)
+ bufferLEByte(
+ ConvertSubCVtoInt8(CV, I, I + ChunkSize, 0 /*NumPaddingZeros*/), 0,
+ aggBuffer);
// For unevenly sized vectors add tail padding zeros.
- if (NeedTailPadding) {
- SmallVector<Constant *, 8> TailPaddedElems;
- for (unsigned I = TailPaddedChunkBegin; I < NumElems; ++I)
- TailPaddedElems.push_back(CV->getAggregateElement(I));
-
- unsigned NumPaddingZeros = ChunkSize - (NumElems - TailPaddedChunkBegin);
- TailPaddedElems.append(
- SmallVector(NumPaddingZeros, ConstantInt::getNullValue(ElemTy)));
-
- bufferLEByte(ConvertSubCVtoInt8(TailPaddedElems), 0, aggBuffer);
+ if (NumTailElems > 0) {
+ unsigned TailStart = NumElems - NumTailElems;
+ unsigned NumPaddingZeros = ChunkSize - NumTailElems;
+ bufferLEByte(ConvertSubCVtoInt8(CV, TailStart, NumElems, NumPaddingZeros),
+ 0, aggBuffer);
}
}
>From 44ef6a09d4f64b694695be70862ac7befda6babb Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Fri, 3 Apr 2026 21:44:00 +0000
Subject: [PATCH 6/7] Address review comments.
- Rename variables to avoid "chunk" terminology.
- Avoid temp array while tail padding zeros.
- Fold single use variables into call-site.
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 29 +++++++++--------------
1 file changed, 11 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 25f95a2fbdcf1..f193c122dd994 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1765,27 +1765,24 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
assert(ElemTySize < 8 && "Expected sub-byte data type.");
assert(8 % ElemTySize == 0 && "Element type size must evenly divide a byte.");
// Number of elements to merge to form a full byte.
- unsigned ChunkSize = 8 / ElemTySize;
- unsigned NumChunks = NumElems / ChunkSize;
- unsigned NumTailElems = NumElems % ChunkSize;
+ unsigned NumElemsPerByte = 8 / ElemTySize;
+ unsigned NumCompleteBytes = NumElems / NumElemsPerByte;
+ unsigned NumTailElems = NumElems % NumElemsPerByte;
// Helper lambda to constant-fold sub-vector of sub-byte type elements into
// i8. Start and end indices of the sub-vector is provided, along with number
// of padding zeros if required.
auto ConvertSubCVtoInt8 = [this, &ElemTy](const ConstantVector *CV,
unsigned Start, unsigned End,
- unsigned NumPaddingZeros) {
+ unsigned NumPaddingZeros = 0) {
// Collect elements to create sub-vector.
SmallVector<Constant *, 8> SubCVElems;
for (unsigned I = Start; I < End; ++I)
SubCVElems.push_back(CV->getAggregateElement(I));
// Optionally pad with zeros.
- if (NumPaddingZeros) {
- SmallVector<Constant *, 8> Zeros(NumPaddingZeros,
- ConstantInt::getNullValue(ElemTy));
- SubCVElems.append(Zeros);
- }
+ for (auto _ : llvm::seq(NumPaddingZeros))
+ SubCVElems.push_back(ConstantInt::getNullValue(ElemTy));
auto SubCV = ConstantVector::get(SubCVElems);
Type *Int8Ty = IntegerType::get(SubCV->getContext(), 8);
@@ -1805,18 +1802,14 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
// Iterate through elements of vector one chunk at a time and buffer that
// chunk.
- for (unsigned I = 0; I < NumChunks; ++I)
- bufferLEByte(
- ConvertSubCVtoInt8(CV, I, I + ChunkSize, 0 /*NumPaddingZeros*/), 0,
- aggBuffer);
+ for (unsigned I = 0; I < NumCompleteBytes; ++I)
+ bufferLEByte(ConvertSubCVtoInt8(CV, I, I + NumElemsPerByte), 0, aggBuffer);
// For unevenly sized vectors add tail padding zeros.
- if (NumTailElems > 0) {
- unsigned TailStart = NumElems - NumTailElems;
- unsigned NumPaddingZeros = ChunkSize - NumTailElems;
- bufferLEByte(ConvertSubCVtoInt8(CV, TailStart, NumElems, NumPaddingZeros),
+ if (NumTailElems > 0)
+ bufferLEByte(ConvertSubCVtoInt8(CV, NumElems - NumTailElems, NumElems,
+ NumElemsPerByte - NumTailElems),
0, aggBuffer);
- }
}
/// lowerConstantForGV - Return an MCExpr for the given Constant. This is mostly
>From 95ce18b8849da007282d3bca66db5a14968d749c Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Fri, 3 Apr 2026 22:24:36 +0000
Subject: [PATCH 7/7] Address review comment.
- Update loops over integer sequence to use llvm::seq for consistency.
---
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index f193c122dd994..942ebb88307e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1777,7 +1777,7 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
unsigned NumPaddingZeros = 0) {
// Collect elements to create sub-vector.
SmallVector<Constant *, 8> SubCVElems;
- for (unsigned I = Start; I < End; ++I)
+ for (unsigned I : llvm::seq(Start, End))
SubCVElems.push_back(CV->getAggregateElement(I));
// Optionally pad with zeros.
@@ -1802,7 +1802,7 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
// Iterate through elements of vector one chunk at a time and buffer that
// chunk.
- for (unsigned I = 0; I < NumCompleteBytes; ++I)
+ for (unsigned I : llvm::seq(NumCompleteBytes))
bufferLEByte(ConvertSubCVtoInt8(CV, I, I + NumElemsPerByte), 0, aggBuffer);
// For unevenly sized vectors add tail padding zeros.
More information about the llvm-commits
mailing list