[llvm] [NVPTX] Merge consecutive elements while buffering constant vectors with sub-byte datatype. (PR #183628)

Fri Apr 3 15:27:55 PDT 2026

https://github.com/karthik-senthil updated https://github.com/llvm/llvm-project/pull/183628

>From 2d2b55740a2047c77f0365b6001ae4f7993611af Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Wed, 25 Feb 2026 22:59:07 +0000
Subject: [PATCH 1/7] [NVPTX] Merge consecutive elements while buffering
 constant vectors with sub-byte datatype.

NVPTXAsmPrinter currently buffers one element at a time while generating PTX
for constant global vectors. This triggers an assertion failure for vectors
with a sub-byte element type (like i4), because the buffer is allocated with
fewer bytes than the vector has elements. This PR fixes the printer to merge
consecutive elements into a full byte before buffering.
---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     | 52 ++++++++++++++++++-
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h       |  3 ++
 .../NVPTX/sub-byte-constant-vectors.ll        | 13 +++++
 3 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index b18d6239f0026..5a9e248a484cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1707,13 +1707,19 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
     }
   }
 
-  // Old constants
-  if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+  // Buffer arrays one element at a time.
+  if (isa<ConstantArray>(CPV)) {
     for (const auto &Op : CPV->operands())
       bufferLEByte(cast<Constant>(Op), 0, aggBuffer);
     return;
   }
 
+  // Constant vectors
+  if (const auto *CVec = dyn_cast<ConstantVector>(CPV)) {
+    bufferAggregateConstVec(CVec, aggBuffer);
+    return;
+  }
+
   if (const auto *CDS = dyn_cast<ConstantDataSequential>(CPV)) {
     for (unsigned I : llvm::seq(CDS->getNumElements()))
       bufferLEByte(cast<Constant>(CDS->getElementAsConstant(I)), 0, aggBuffer);
@@ -1737,6 +1743,48 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
   llvm_unreachable("unsupported constant type in printAggregateConstant()");
 }
 
+void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
+                                              AggBuffer *aggBuffer) {
+  unsigned NumElems = CV->getType()->getNumElements();
+  const unsigned BuffSize = aggBuffer->getSize();
+
+  // Buffer one element at a time if we have allocated enough buffer space.
+  if (BuffSize >= NumElems) {
+    for (const auto &Op : CV->operands())
+      bufferLEByte(cast<Constant>(Op), 0, aggBuffer);
+    return;
+  }
+
+  // We have more elements than allocated buffer space, this implies sub-byte
+  // datatype in the vector. Merge consecutive elements to form a full byte.
+  Type *ElemTy = CV->getType()->getElementType();
+  assert(ElemTy->isIntegerTy() && "Expected integer data type.");
+  unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
+  assert(ElemTySize < 8 && "Expected sub-byte data type.");
+  // Number of elements to merge to form a full byte.
+  unsigned ChunkSize = 8 / ElemTySize;
+
+  // Iterate through elements of vector one chunk at a time and buffer that
+  // chunk.
+  for (unsigned I = 0; I < NumElems; I += ChunkSize) {
+    unsigned CurrVal = 0;
+    for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J) {
+      auto *Elem = cast<ConstantInt>(CV->getAggregateElement(J));
+      unsigned ElemVal = Elem->getZExtValue();
+      // PTX datalayout is little-endian, so shift element based on its position
+      // in the chunk.
+      unsigned ShiftAmount = (J - I) * ElemTySize;
+      CurrVal |= ElemVal << ShiftAmount;
+    }
+
+    // Create a new constant to represent the merged value of the chunk and
+    // buffer it.
+    auto *MergedElem =
+        ConstantInt::get(Type::getInt8Ty(ElemTy->getContext()), CurrVal);
+    bufferLEByte(MergedElem, 0, aggBuffer);
+  }
+}
+
 /// lowerConstantForGV - Return an MCExpr for the given Constant.  This is mostly
 /// a copy from AsmPrinter::lowerConstant, except customized to only handle
 /// expressions that are representable in PTX and create
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index ebdfac93c04f4..adede1689cc01 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -109,6 +109,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
         : size(size), buffer(size), curpos(0), AP(AP),
           EmitGeneric(AP.EmitGeneric) {}
 
+    unsigned getSize() const { return size; }
+
     // Copy Num bytes from Ptr.
     // if Bytes > Num, zero fill up to Bytes.
     void addBytes(const unsigned char *Ptr, unsigned Num, unsigned Bytes) {
@@ -220,6 +222,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
   void printFPConstant(const ConstantFP *Fp, raw_ostream &O) const;
   void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
   void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
+  void bufferAggregateConstVec(const ConstantVector *CV, AggBuffer *aggBuffer);
 
   void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
   void emitDeclarations(const Module &, raw_ostream &O);
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
new file mode 100644
index 0000000000000..9ab9d05d121bf
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
+
+target triple = "nvptx-nvidia-cuda"
+
+; CHECK: .visible .global .align 1 .b8 test0[1] = {33};
+ at test0 = local_unnamed_addr addrspace(1) constant <2 x i4> <i4 1, i4 2>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test1[2] = {33, 3};
+ at test1 = local_unnamed_addr addrspace(1) constant <3 x i4> <i4 1, i4 2, i4 3>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test2[1] = {228};
+ at test2 = local_unnamed_addr addrspace(1) constant <4 x i2> <i2 0, i2 1, i2 2, i2 3>, align 1

>From 469541a712a56fcabdce9405c96135345f11e8ca Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Thu, 26 Feb 2026 23:06:38 +0000
Subject: [PATCH 2/7] Use ConstantFolding instead of manually shifting bits.

---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 36 +++++++++++++++--------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 5a9e248a484cc..ba63fdafe6d1d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1761,26 +1761,36 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
   assert(ElemTy->isIntegerTy() && "Expected integer data type.");
   unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
   assert(ElemTySize < 8 && "Expected sub-byte data type.");
+  assert(8 % ElemTySize == 0 && "Element type size must evenly divide a byte.");
   // Number of elements to merge to form a full byte.
   unsigned ChunkSize = 8 / ElemTySize;
 
   // Iterate through elements of vector one chunk at a time and buffer that
   // chunk.
   for (unsigned I = 0; I < NumElems; I += ChunkSize) {
-    unsigned CurrVal = 0;
-    for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J) {
-      auto *Elem = cast<ConstantInt>(CV->getAggregateElement(J));
-      unsigned ElemVal = Elem->getZExtValue();
-      // PTX datalayout is little-endian, so shift element based on its position
-      // in the chunk.
-      unsigned ShiftAmount = (J - I) * ElemTySize;
-      CurrVal |= ElemVal << ShiftAmount;
-    }
+    // Collect elements in chunk to create sub-vector.
+    SmallVector<Constant *, 8> SubCVElems;
+    for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J)
+      SubCVElems.push_back(CV->getAggregateElement(J));
+
+    // For unevenly sized vectors add padding zeros.
+    unsigned PaddingZeroCount = ChunkSize - SubCVElems.size();
+    for (unsigned I = 0; I < PaddingZeroCount; ++I)
+      SubCVElems.push_back(ConstantInt::getNullValue(ElemTy));
+
+    auto SubCV = ConstantVector::get(SubCVElems);
+    Type *Int8Ty = IntegerType::get(CV->getContext(), 8);
+
+    // Merge elements of the chunk using ConstantFolding and buffer it.
+    ConstantInt *MergedElem =
+        dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
+            ConstantExpr::getBitCast(const_cast<Constant *>(SubCV), Int8Ty),
+            getDataLayout()));
+
+    if (!MergedElem)
+      report_fatal_error(
+          "Cannot lower vector global with unusual element type");
 
-    // Create a new constant to represent the merged value of the chunk and
-    // buffer it.
-    auto *MergedElem =
-        ConstantInt::get(Type::getInt8Ty(ElemTy->getContext()), CurrVal);
     bufferLEByte(MergedElem, 0, aggBuffer);
   }
 }

>From 33803f749762519ee92b44741524ee7d840d5933 Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Tue, 3 Mar 2026 18:29:43 +0000
Subject: [PATCH 3/7] Address review comments.

- Rename test, add new test to check conversion of 2xi4 vector.
- Add comment to clarify current use-case of changes.
---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     |  4 ++-
 .../NVPTX/sub-byte-constant-vector-convert.ll | 32 +++++++++++++++++++
 ....ll => sub-byte-constant-vectors-i4-i2.ll} |  3 ++
 3 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll
 rename llvm/test/CodeGen/NVPTX/{sub-byte-constant-vectors.ll => sub-byte-constant-vectors-i4-i2.ll} (83%)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index ba63fdafe6d1d..528b7f7803e4f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1756,7 +1756,9 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
   }
 
   // We have more elements than allocated buffer space, this implies sub-byte
-  // datatype in the vector. Merge consecutive elements to form a full byte.
+  // datatype in the vector. Merge consecutive elements to form a full byte. We
+  // expect that 8 % sub-byte-elem-size should be 0 and current expected usage
+  // is for i4 (for e2m1-fp4 types).
   Type *ElemTy = CV->getType()->getElementType();
   assert(ElemTy->isIntegerTy() && "Expected integer data type.");
   unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll
new file mode 100644
index 0000000000000..90bf9afc8da0a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vector-convert.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; Test to verify that NVPTX backend correctly handles conversion of constant
+; global vectors containing sub-byte sized elements.
+
+; RUN: llc < %s -O0 -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -O0 -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx-nvidia-cuda"
+
+; CHECK: .visible .global .align 1 .b8 test0[1] = {33};
+ at test0 = local_unnamed_addr addrspace(1) constant <2 x i4> <i4 1, i4 2>, align 1
+
+define <2 x half> @foo() {
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.global.b8 %rs1, [test0];
+; CHECK-NEXT:    {
+; CHECK-NEXT:    .reg .b8 %e2m1x2_in;
+; CHECK-NEXT:    cvt.u8.u16 %e2m1x2_in, %rs1;
+; CHECK-NEXT:    cvt.rn.f16x2.e2m1x2 %r1, %e2m1x2_in;
+; CHECK-NEXT:    }
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+    %ld = load i8, ptr addrspace(1) @test0, align 1
+    %in = zext nneg i8 %ld to i16
+    %val = call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 %in)
+    ret <2 x half> %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
similarity index 83%
rename from llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
rename to llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
index 9ab9d05d121bf..27e7a0f220f69 100644
--- a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
@@ -1,3 +1,6 @@
+; Test to verify that NVPTX backend correctly handles constant global vectors
+; containing sub-byte sized elements.
+
 ; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
 

>From 3fcf18fb8dc56e6059074330c13c9dce8340d5fc Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Thu, 2 Apr 2026 23:02:38 +0000
Subject: [PATCH 4/7] Address review comments.

- Update comments.
- Separate chunk processing from tail padding logic.
- Rename getSize to getBufferSize.
---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     | 72 ++++++++++++-------
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h       | 10 +--
 .../NVPTX/sub-byte-constant-vectors-i4-i2.ll  |  6 ++
 3 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 528b7f7803e4f..392cc050ce64d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1115,7 +1115,7 @@ void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
   // ptxas. This saves on both space requirements for the generated PTX and on
   // memory use by ptxas. (See:
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#global-state-space)
-  unsigned int InitializerCount = size;
+  unsigned int InitializerCount = Size;
   // TODO: symbols make this harder, but it would still be good to trim trailing
   // 0s for aggs with symbols as well.
   if (numSymbols() == 0)
@@ -1153,11 +1153,11 @@ void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
 
 void NVPTXAsmPrinter::AggBuffer::printWords(raw_ostream &os) {
   unsigned int ptrSize = AP.MAI->getCodePointerSize();
-  symbolPosInBuffer.push_back(size);
+  symbolPosInBuffer.push_back(Size);
   unsigned int nSym = 0;
   unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
   assert(nextSymbolPos % ptrSize == 0);
-  for (unsigned int pos = 0; pos < size; pos += ptrSize) {
+  for (unsigned int pos = 0; pos < Size; pos += ptrSize) {
     if (pos)
       os << ", ";
     if (pos == nextSymbolPos) {
@@ -1746,7 +1746,7 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
 void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
                                               AggBuffer *aggBuffer) {
   unsigned NumElems = CV->getType()->getNumElements();
-  const unsigned BuffSize = aggBuffer->getSize();
+  const unsigned BuffSize = aggBuffer->getBufferSize();
 
   // Buffer one element at a time if we have allocated enough buffer space.
   if (BuffSize >= NumElems) {
@@ -1755,10 +1755,10 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
     return;
   }
 
-  // We have more elements than allocated buffer space, this implies sub-byte
-  // datatype in the vector. Merge consecutive elements to form a full byte. We
-  // expect that 8 % sub-byte-elem-size should be 0 and current expected usage
-  // is for i4 (for e2m1-fp4 types).
+  // Sub-byte datatypes will have more elements than bytes allocated for the
+  // buffer. Merge consecutive elements to form a full byte. We expect that 8 %
+  // sub-byte-elem-size should be 0 and current expected usage is for i4 (for
+  // e2m1-fp4 types).
   Type *ElemTy = CV->getType()->getElementType();
   assert(ElemTy->isIntegerTy() && "Expected integer data type.");
   unsigned ElemTySize = ElemTy->getPrimitiveSizeInBits();
@@ -1767,23 +1767,12 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
   // Number of elements to merge to form a full byte.
   unsigned ChunkSize = 8 / ElemTySize;
 
-  // Iterate through elements of vector one chunk at a time and buffer that
-  // chunk.
-  for (unsigned I = 0; I < NumElems; I += ChunkSize) {
-    // Collect elements in chunk to create sub-vector.
-    SmallVector<Constant *, 8> SubCVElems;
-    for (unsigned J = I; J < std::min(I + ChunkSize, NumElems); ++J)
-      SubCVElems.push_back(CV->getAggregateElement(J));
-
-    // For unevenly sized vectors add padding zeros.
-    unsigned PaddingZeroCount = ChunkSize - SubCVElems.size();
-    for (unsigned I = 0; I < PaddingZeroCount; ++I)
-      SubCVElems.push_back(ConstantInt::getNullValue(ElemTy));
-
+  // Helper lambda to constant-fold array of sub-byte constants to i8.
+  auto ConvertSubCVtoInt8 = [this](ArrayRef<Constant *> SubCVElems) {
     auto SubCV = ConstantVector::get(SubCVElems);
-    Type *Int8Ty = IntegerType::get(CV->getContext(), 8);
+    Type *Int8Ty = IntegerType::get(SubCV->getContext(), 8);
 
-    // Merge elements of the chunk using ConstantFolding and buffer it.
+    // Merge elements of the chunk using ConstantFolding.
     ConstantInt *MergedElem =
         dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
             ConstantExpr::getBitCast(const_cast<Constant *>(SubCV), Int8Ty),
@@ -1793,7 +1782,42 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
       report_fatal_error(
           "Cannot lower vector global with unusual element type");
 
-    bufferLEByte(MergedElem, 0, aggBuffer);
+    return MergedElem;
+  };
+
+  // Iterate through elements of vector one chunk at a time and buffer that
+  // chunk.
+  unsigned TailPaddedChunkBegin = 0;
+  bool NeedTailPadding = false;
+  for (unsigned I = 0; I < NumElems; I += ChunkSize) {
+    // If we have less elements than chunk size, then break to pad the final
+    // chunk with zeroes.
+    if (NumElems - I < ChunkSize) {
+      TailPaddedChunkBegin = I;
+      NeedTailPadding = true;
+      break;
+    }
+
+    // Collect elements in chunk to create sub-vector.
+    SmallVector<Constant *, 8> SubCVElems;
+    for (unsigned J = I; J < ChunkSize; ++J)
+      SubCVElems.push_back(CV->getAggregateElement(J));
+
+    // Buffer merged element.
+    bufferLEByte(ConvertSubCVtoInt8(SubCVElems), 0, aggBuffer);
+  }
+
+  // For unevenly sized vectors add tail padding zeros.
+  if (NeedTailPadding) {
+    SmallVector<Constant *, 8> TailPaddedElems;
+    for (unsigned I = TailPaddedChunkBegin; I < NumElems; ++I)
+      TailPaddedElems.push_back(CV->getAggregateElement(I));
+
+    unsigned NumPaddingZeros = ChunkSize - (NumElems - TailPaddedChunkBegin);
+    TailPaddedElems.append(
+        SmallVector(NumPaddingZeros, ConstantInt::getNullValue(ElemTy)));
+
+    bufferLEByte(ConvertSubCVtoInt8(TailPaddedElems), 0, aggBuffer);
   }
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index adede1689cc01..f95c4c3299cb8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -88,7 +88,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
     }
 
   private:
-    const unsigned size;   // size of the buffer in bytes
+    const unsigned Size;               // size of the buffer in bytes
     std::vector<unsigned char> buffer; // the buffer
     SmallVector<unsigned, 4> symbolPosInBuffer;
     SmallVector<const Value *, 4> Symbols;
@@ -105,11 +105,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
     const bool EmitGeneric;
 
   public:
-    AggBuffer(unsigned size, const NVPTXAsmPrinter &AP)
-        : size(size), buffer(size), curpos(0), AP(AP),
+    AggBuffer(unsigned Size, const NVPTXAsmPrinter &AP)
+        : Size(Size), buffer(Size), curpos(0), AP(AP),
           EmitGeneric(AP.EmitGeneric) {}
 
-    unsigned getSize() const { return size; }
+    unsigned getBufferSize() const { return Size; }
 
     // Copy Num bytes from Ptr.
     // if Bytes > Num, zero fill up to Bytes.
@@ -121,7 +121,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
     }
 
     void addByte(uint8_t Byte) {
-      assert(curpos < size);
+      assert(curpos < Size);
       buffer[curpos] = Byte;
       curpos++;
     }
diff --git a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
index 27e7a0f220f69..9b59bdbd438d8 100644
--- a/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
+++ b/llvm/test/CodeGen/NVPTX/sub-byte-constant-vectors-i4-i2.ll
@@ -14,3 +14,9 @@ target triple = "nvptx-nvidia-cuda"
 
 ; CHECK: .visible .global .align 1 .b8 test2[1] = {228};
 @test2 = local_unnamed_addr addrspace(1) constant <4 x i2> <i2 0, i2 1, i2 2, i2 3>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test3[2] = {228, 4};
+ at test3 = local_unnamed_addr addrspace(1) constant <6 x i2> <i2 0, i2 1, i2 2, i2 3, i2 0, i2 1>, align 1
+
+; CHECK: .visible .global .align 1 .b8 test4[1] = {1};
+ at test4 = local_unnamed_addr addrspace(1) constant <1 x i4> <i4 1>, align 1

>From 6677b6cbd1d7252d2aca06a0dc2551986fb16eae Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Fri, 3 Apr 2026 00:12:06 +0000
Subject: [PATCH 5/7] Address review comments.

- Simplify logic to process chunks and tail padding.
- Fold element retrieval and tail padding zeros into helper lambda.
---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 62 +++++++++++------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 392cc050ce64d..25f95a2fbdcf1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1766,13 +1766,31 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
   assert(8 % ElemTySize == 0 && "Element type size must evenly divide a byte.");
   // Number of elements to merge to form a full byte.
   unsigned ChunkSize = 8 / ElemTySize;
+  unsigned NumChunks = NumElems / ChunkSize;
+  unsigned NumTailElems = NumElems % ChunkSize;
+
+  // Helper lambda to constant-fold sub-vector of sub-byte type elements into
+  // i8. Start and end indices of the sub-vector is provided, along with number
+  // of padding zeros if required.
+  auto ConvertSubCVtoInt8 = [this, &ElemTy](const ConstantVector *CV,
+                                            unsigned Start, unsigned End,
+                                            unsigned NumPaddingZeros) {
+    // Collect elements to create sub-vector.
+    SmallVector<Constant *, 8> SubCVElems;
+    for (unsigned I = Start; I < End; ++I)
+      SubCVElems.push_back(CV->getAggregateElement(I));
+
+    // Optionally pad with zeros.
+    if (NumPaddingZeros) {
+      SmallVector<Constant *, 8> Zeros(NumPaddingZeros,
+                                       ConstantInt::getNullValue(ElemTy));
+      SubCVElems.append(Zeros);
+    }
 
-  // Helper lambda to constant-fold array of sub-byte constants to i8.
-  auto ConvertSubCVtoInt8 = [this](ArrayRef<Constant *> SubCVElems) {
     auto SubCV = ConstantVector::get(SubCVElems);
     Type *Int8Ty = IntegerType::get(SubCV->getContext(), 8);
 
-    // Merge elements of the chunk using ConstantFolding.
+    // Merge elements of the sub-vector using ConstantFolding.
     ConstantInt *MergedElem =
         dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
             ConstantExpr::getBitCast(const_cast<Constant *>(SubCV), Int8Ty),
@@ -1787,37 +1805,17 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
 
   // Iterate through elements of vector one chunk at a time and buffer that
   // chunk.
-  unsigned TailPaddedChunkBegin = 0;
-  bool NeedTailPadding = false;
-  for (unsigned I = 0; I < NumElems; I += ChunkSize) {
-    // If we have less elements than chunk size, then break to pad the final
-    // chunk with zeroes.
-    if (NumElems - I < ChunkSize) {
-      TailPaddedChunkBegin = I;
-      NeedTailPadding = true;
-      break;
-    }
-
-    // Collect elements in chunk to create sub-vector.
-    SmallVector<Constant *, 8> SubCVElems;
-    for (unsigned J = I; J < ChunkSize; ++J)
-      SubCVElems.push_back(CV->getAggregateElement(J));
-
-    // Buffer merged element.
-    bufferLEByte(ConvertSubCVtoInt8(SubCVElems), 0, aggBuffer);
-  }
+  for (unsigned I = 0; I < NumChunks; ++I)
+    bufferLEByte(
+        ConvertSubCVtoInt8(CV, I, I + ChunkSize, 0 /*NumPaddingZeros*/), 0,
+        aggBuffer);
 
   // For unevenly sized vectors add tail padding zeros.
-  if (NeedTailPadding) {
-    SmallVector<Constant *, 8> TailPaddedElems;
-    for (unsigned I = TailPaddedChunkBegin; I < NumElems; ++I)
-      TailPaddedElems.push_back(CV->getAggregateElement(I));
-
-    unsigned NumPaddingZeros = ChunkSize - (NumElems - TailPaddedChunkBegin);
-    TailPaddedElems.append(
-        SmallVector(NumPaddingZeros, ConstantInt::getNullValue(ElemTy)));
-
-    bufferLEByte(ConvertSubCVtoInt8(TailPaddedElems), 0, aggBuffer);
+  if (NumTailElems > 0) {
+    unsigned TailStart = NumElems - NumTailElems;
+    unsigned NumPaddingZeros = ChunkSize - NumTailElems;
+    bufferLEByte(ConvertSubCVtoInt8(CV, TailStart, NumElems, NumPaddingZeros),
+                 0, aggBuffer);
   }
 }
 

>From 44ef6a09d4f64b694695be70862ac7befda6babb Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Fri, 3 Apr 2026 21:44:00 +0000
Subject: [PATCH 6/7] Address review comments.

- Rename variables to avoid "chunk" terminology.
- Avoid temp array while tail padding zeros.
- Fold single use variables into call-site.
---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 29 +++++++++--------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 25f95a2fbdcf1..f193c122dd994 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1765,27 +1765,24 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
   assert(ElemTySize < 8 && "Expected sub-byte data type.");
   assert(8 % ElemTySize == 0 && "Element type size must evenly divide a byte.");
   // Number of elements to merge to form a full byte.
-  unsigned ChunkSize = 8 / ElemTySize;
-  unsigned NumChunks = NumElems / ChunkSize;
-  unsigned NumTailElems = NumElems % ChunkSize;
+  unsigned NumElemsPerByte = 8 / ElemTySize;
+  unsigned NumCompleteBytes = NumElems / NumElemsPerByte;
+  unsigned NumTailElems = NumElems % NumElemsPerByte;
 
   // Helper lambda to constant-fold sub-vector of sub-byte type elements into
   // i8. Start and end indices of the sub-vector is provided, along with number
   // of padding zeros if required.
   auto ConvertSubCVtoInt8 = [this, &ElemTy](const ConstantVector *CV,
                                             unsigned Start, unsigned End,
-                                            unsigned NumPaddingZeros) {
+                                            unsigned NumPaddingZeros = 0) {
     // Collect elements to create sub-vector.
     SmallVector<Constant *, 8> SubCVElems;
     for (unsigned I = Start; I < End; ++I)
       SubCVElems.push_back(CV->getAggregateElement(I));
 
     // Optionally pad with zeros.
-    if (NumPaddingZeros) {
-      SmallVector<Constant *, 8> Zeros(NumPaddingZeros,
-                                       ConstantInt::getNullValue(ElemTy));
-      SubCVElems.append(Zeros);
-    }
+    for (auto _ : llvm::seq(NumPaddingZeros))
+      SubCVElems.push_back(ConstantInt::getNullValue(ElemTy));
 
     auto SubCV = ConstantVector::get(SubCVElems);
     Type *Int8Ty = IntegerType::get(SubCV->getContext(), 8);
@@ -1805,18 +1802,14 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
 
   // Iterate through elements of vector one chunk at a time and buffer that
   // chunk.
-  for (unsigned I = 0; I < NumChunks; ++I)
-    bufferLEByte(
-        ConvertSubCVtoInt8(CV, I, I + ChunkSize, 0 /*NumPaddingZeros*/), 0,
-        aggBuffer);
+  for (unsigned I = 0; I < NumCompleteBytes; ++I)
+    bufferLEByte(ConvertSubCVtoInt8(CV, I, I + NumElemsPerByte), 0, aggBuffer);
 
   // For unevenly sized vectors add tail padding zeros.
-  if (NumTailElems > 0) {
-    unsigned TailStart = NumElems - NumTailElems;
-    unsigned NumPaddingZeros = ChunkSize - NumTailElems;
-    bufferLEByte(ConvertSubCVtoInt8(CV, TailStart, NumElems, NumPaddingZeros),
+  if (NumTailElems > 0)
+    bufferLEByte(ConvertSubCVtoInt8(CV, NumElems - NumTailElems, NumElems,
+                                    NumElemsPerByte - NumTailElems),
                  0, aggBuffer);
-  }
 }
 
 /// lowerConstantForGV - Return an MCExpr for the given Constant.  This is mostly

>From 95ce18b8849da007282d3bca66db5a14968d749c Mon Sep 17 00:00:00 2001
From: Karthik Senthil <ksenthilkuma at nvidia.com>
Date: Fri, 3 Apr 2026 22:24:36 +0000
Subject: [PATCH 7/7] Address review comment.

- Update loops over integer sequence to use llvm::seq for consistency.
- Fix the start index used when packing complete bytes: byte I must cover
  elements [I * NumElemsPerByte, (I + 1) * NumElemsPerByte), not
  [I, I + NumElemsPerByte).
---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index f193c122dd994..942ebb88307e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1777,7 +1777,7 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
                                             unsigned NumPaddingZeros = 0) {
     // Collect elements to create sub-vector.
     SmallVector<Constant *, 8> SubCVElems;
-    for (unsigned I = Start; I < End; ++I)
+    for (unsigned I : llvm::seq(Start, End))
       SubCVElems.push_back(CV->getAggregateElement(I));
 
     // Optionally pad with zeros.
@@ -1802,7 +1802,11 @@ void NVPTXAsmPrinter::bufferAggregateConstVec(const ConstantVector *CV,
 
   // Iterate through elements of vector one chunk at a time and buffer that
   // chunk.
-  for (unsigned I = 0; I < NumCompleteBytes; ++I)
-    bufferLEByte(ConvertSubCVtoInt8(CV, I, I + NumElemsPerByte), 0, aggBuffer);
+  for (unsigned I : llvm::seq(NumCompleteBytes)) {
+    // Byte I packs elements [I * NumElemsPerByte, (I + 1) * NumElemsPerByte).
+    unsigned Start = I * NumElemsPerByte;
+    bufferLEByte(ConvertSubCVtoInt8(CV, Start, Start + NumElemsPerByte), 0,
+                 aggBuffer);
+  }
 
   // For unevenly sized vectors add tail padding zeros.