[llvm] 32eed88 - Reapply "[NVPTX] Use the mask() operator to initialize packed structs with pointers"

Mon Jul 18 09:57:45 PDT 2022

Author: Igor Kudrin
Date: 2022-07-18T20:56:26+04:00
New Revision: 32eed8828e0c6b2efdd5b31fa3f18aef1c6c7d7b

URL: https://github.com/llvm/llvm-project/commit/32eed8828e0c6b2efdd5b31fa3f18aef1c6c7d7b
DIFF: https://github.com/llvm/llvm-project/commit/32eed8828e0c6b2efdd5b31fa3f18aef1c6c7d7b.diff

LOG: Reapply "[NVPTX] Use the mask() operator to initialize packed structs with pointers"

The original patch revealed an issue of reading incorrect values on BE hosts.
That is now changed to use `endian::read32le()` and `endian::read64le()`.

Original commit message:

The current implementation assumes that all pointers used in the
initialization of an aggregate are aligned according to the pointer size
of the target; that might not be so if the object is packed. In that
case, an array of .u8 should be used and pointers should be decorated
with the mask() operator.

The operator was introduced in PTX ISA 7.1, so an error is issued if the
case is detected for an earlier version.

Differential Revision: https://reviews.llvm.org/D127504

Added: 
    llvm/test/CodeGen/NVPTX/packed-aggr.ll

Modified: 
    llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
    llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
    llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 422e47b3d97d7..640c1e88f38b3 100644

--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -73,8 +73,10 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/NativeFormatting.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -1172,11 +1174,26 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           bufferAggregateConstant(Initializer, &aggBuffer);
           if (aggBuffer.numSymbols()) {
             unsigned int ptrSize = MAI->getCodePointerSize();
-            O << " .u" << ptrSize * 8 << " ";
-            getSymbol(GVar)->print(O, MAI);
-            O << "[" << ElementSize / ptrSize << "] = {";
-            aggBuffer.printWords(O);
-            O << "}";
+            if (ElementSize % ptrSize ||
+                !aggBuffer.allSymbolsAligned(ptrSize)) {
+              // Print in bytes and use the mask() operator for pointers.
+              if (!STI.hasMaskOperator())
+                report_fatal_error(
+                    "initialized packed aggregate with pointers '" +
+                    GVar->getName() +
+                    "' requires at least PTX ISA version 7.1");
+              O << " .u8 ";
+              getSymbol(GVar)->print(O, MAI);
+              O << "[" << ElementSize << "] = {";
+              aggBuffer.printBytes(O);
+              O << "}";
+            } else {
+              O << " .u" << ptrSize * 8 << " ";
+              getSymbol(GVar)->print(O, MAI);
+              O << "[" << ElementSize / ptrSize << "] = {";
+              aggBuffer.printWords(O);
+              O << "}";
+            }
           } else {
             O << " .b8 ";
             getSymbol(GVar)->print(O, MAI);
@@ -1233,10 +1250,33 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
 }
 
 void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
-  for (unsigned int pos = 0; pos < size; ++pos) {
+  unsigned int ptrSize = AP.MAI->getCodePointerSize();
+  symbolPosInBuffer.push_back(size);
+  unsigned int nSym = 0;
+  unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+  for (unsigned int pos = 0; pos < size;) {
     if (pos)
       os << ", ";
-    os << (unsigned int)buffer[pos];
+    if (pos != nextSymbolPos) {
+      os << (unsigned int)buffer[pos];
+      ++pos;
+      continue;
+    }
+    // Generate a per-byte mask() operator for the symbol, which looks like:
+    //   .global .u8 addr[] = {0xFF(foo), 0xFF00(foo), 0xFF0000(foo), ...};
+    // See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers
+    std::string symText;
+    llvm::raw_string_ostream oss(symText);
+    printSymbol(nSym, oss);
+    for (unsigned i = 0; i < ptrSize; ++i) {
+      if (i)
+        os << ", ";
+      llvm::write_hex(os, 0xFFULL << i * 8, HexPrintStyle::PrefixUpper);
+      os << "(" << symText << ")";
+    }
+    pos += ptrSize;
+    nextSymbolPos = symbolPosInBuffer[++nSym];
+    assert(nextSymbolPos >= pos);
   }
 }
 
@@ -1255,9 +1295,9 @@ void NVPTXAsmPrinter::AggBuffer::printWords(raw_ostream &os) {
       assert(nextSymbolPos % ptrSize == 0);
       assert(nextSymbolPos >= pos + ptrSize);
     } else if (ptrSize == 4)
-      os << *(uint32_t *)(&buffer[pos]);
+      os << support::endian::read32le(&buffer[pos]);
     else
-      os << *(uint64_t *)(&buffer[pos]);
+      os << support::endian::read64le(&buffer[pos]);
   }
 }
 

diff  --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index ebf9e29b5a627..710c089e33251 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -61,26 +61,31 @@ class MCOperand;
 class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
 
   class AggBuffer {
-    // Used to buffer the emitted string for initializing global
-    // aggregates.
+    // Used to buffer the emitted string for initializing global aggregates.
     //
-    // Normally an aggregate (array, vector or structure) is emitted
-    // as a u8[]. However, if one element/field of the aggregate
-    // is a non-NULL address, then the aggregate is emitted as u32[]
-    // or u64[].
+    // Normally an aggregate (array, vector, or structure) is emitted as a u8[].
+    // However, if either element/field of the aggregate is a non-NULL address,
+    // and all such addresses are properly aligned, then the aggregate is
+    // emitted as u32[] or u64[]. In the case of unaligned addresses, the
+    // aggregate is emitted as u8[], and the mask() operator is used for all
+    // pointers.
     //
-    // We first layout the aggregate in 'buffer' in bytes, except for
-    // those symbol addresses. For the i-th symbol address in the
-    //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
-    // are filled with 0s. symbolPosInBuffer[i-1] records its position
-    // in 'buffer', and Symbols[i-1] records the Value*.
+    // We first layout the aggregate in 'buffer' in bytes, except for those
+    // symbol addresses. For the i-th symbol address in the aggregate, its
+    // corresponding 4-byte or 8-byte elements in 'buffer' are filled with 0s.
+    // symbolPosInBuffer[i-1] records its position in 'buffer', and Symbols[i-1]
+    // records the Value*.
     //
-    // Once we have this AggBuffer setup, we can choose how to print
-    // it out.
+    // Once we have this AggBuffer setup, we can choose how to print it out.
   public:
     // number of symbol addresses
     unsigned numSymbols() const { return Symbols.size(); }
 
+    bool allSymbolsAligned(unsigned ptrSize) const {
+      return llvm::all_of(symbolPosInBuffer,
+                          [=](unsigned pos) { return pos % ptrSize == 0; });
+    }
+
   private:
     const unsigned size;   // size of the buffer in bytes
     std::vector<unsigned char> buffer; // the buffer

diff  --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 9a249d3da3d56..cea3dce3f1c55 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -77,6 +77,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasImageHandles() const;
   bool hasFP16Math() const { return SmVersion >= 53; }
   bool allowFP16Math() const;
+  bool hasMaskOperator() const { return PTXVersion >= 71; }
   unsigned int getSmVersion() const { return SmVersion; }
   std::string getTargetName() const { return TargetName; }
 

diff  --git a/llvm/test/CodeGen/NVPTX/packed-aggr.ll b/llvm/test/CodeGen/NVPTX/packed-aggr.ll
new file mode 100644
index 0000000000000..dba888a87e483
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/packed-aggr.ll
@@ -0,0 +1,95 @@
+; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx70 2>&1 | \
+; RUN:   FileCheck %s --check-prefix=ERROR
+
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | \
+; RUN:   FileCheck %s --check-prefixes=CHECK,CHECK32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | \
+; RUN:   FileCheck %s --check-prefixes=CHECK,CHECK64
+; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
+; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
+
+;; Test that packed structs with symbol references are represented using the
+;; mask() operator.
+
+declare void @func()
+ at p = addrspace(1) global i8 0
+; CHECK: .extern .func func
+; CHECK: .u8 p;
+
+%t1 = type <{ i16, i8*, i8, void ()*, i8*, i32 }>
+ at s1 = addrspace(1) global %t1 <{
+; ERROR: initialized packed aggregate with pointers 's1' requires at least PTX ISA version 7.1
+; CHECK32: .global .align 1 .u8 s1[19] = {
+; CHECK64: .global .align 1 .u8 s1[31] = {
+    i16 12,
+; CHECK-SAME:   12, 0,
+    i8* addrspacecast (i8 addrspace(1)* @p to i8*),
+; CHECK-SAME:   0xFF(generic(p)), 0xFF00(generic(p)), 0xFF0000(generic(p)), 0xFF000000(generic(p)),
+; CHECK64-SAME: 0xFF00000000(generic(p)), 0xFF0000000000(generic(p)), 0xFF000000000000(generic(p)), 0xFF00000000000000(generic(p)),
+    i8 34,
+; CHECK-SAME:   34
+    void ()* @func,
+; CHECK-SAME:   0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
+; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
+    i8* addrspacecast (i8 addrspace(1)* getelementptr (i8, i8 addrspace(1)* @p, i32 3) to i8*),
+; CHECK-SAME:   0xFF(generic(p)+3), 0xFF00(generic(p)+3), 0xFF0000(generic(p)+3), 0xFF000000(generic(p)+3),
+; CHECK64-SAME: 0xFF00000000(generic(p)+3), 0xFF0000000000(generic(p)+3), 0xFF000000000000(generic(p)+3), 0xFF00000000000000(generic(p)+3),
+    i32 56 }>, align 1
+; CHECK-SAME:   56, 0, 0, 0};
+
+;; Test a case than an unaligned pointer is in a nested struct.
+
+%t2i = type <{ void ()* }>
+%t2o = type { i8, %t2i, i32 }
+ at s2 = addrspace(1) global %t2o {
+; CHECK32: .global .align 8 .u8 s2[12] = {
+; CHECK64: .global .align 8 .u8 s2[16] = {
+    i8 12,
+; CHECK-SAME:   12,
+    %t2i <{ void()* @func }>,
+; CHECK-SAME:   0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
+; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
+    i32 34}
+; CHECK-SAME:   0, 0, 0,
+; CHECK-SAME:   34, 0, 0, 0};
+
+;; Test that a packed struct which size is not multiple of the pointer size
+;; is printed in bytes and uses the mask() operator for pointers even though
+;; the pointers are aligned.
+
+%t3 = type <{ void ()*, i8 }>
+ at s3 = addrspace(1) global %t3 <{
+; CHECK32: .global .align 1 .u8 s3[5] = {
+; CHECK64: .global .align 1 .u8 s3[9] = {
+    void ()* @func,
+; CHECK-SAME:   0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
+; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
+    i8 56 }>, align 1
+; CHECK-SAME:   56};
+
+;; Test that a packed struct with aligned pointers is printed in words.
+
+%t4 = type <{ void ()*, i64 }>
+ at s4 = addrspace(1) global %t4 <{
+; CHECK32: .global .align 1 .u32 s4[3] = {
+; CHECK64: .global .align 1 .u64 s4[2] = {
+    void()* @func,
+; CHECK-SAME:   func,
+    i64 15}>, align 1
+; CHECK32-SAME: 15, 0};
+; CHECK64-SAME: 15};
+
+;; Test that a packed struct with unaligned pointers inside an array is handled.
+
+%t5 = type <{ void ()*, i16 }>
+ at a5 = addrspace(1) global [2 x %t5] [%t5 <{ void()* @func, i16 5 }>, %t5 <{ void()* @func, i16 9 }> ]
+; CHECK32: .global .align 8 .u8 a5[12] = {
+; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 5, 0,
+; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 9, 0};
+; CHECK64: .global .align 8 .u8 a5[20] = {
+; CHECK64-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
+; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
+; CHECK64-SAME: 5, 0,
+; CHECK64-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
+; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
+; CHECK64-SAME: 9, 0};