[PATCH] D101923: [TableGen] Make the NUL character invalid in .td files

Wed May 5 10:04:01 PDT 2021

Paul-C-Anagnostopoulos created this revision.
Paul-C-Anagnostopoulos added reviewers: dblaikie, jansvoboda11, craig.topper.
Herald added a subscriber: hiraditya.
Paul-C-Anagnostopoulos requested review of this revision.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.

This little revision makes the NUL character officially invalid in .td files.

Note that the TableGen lexer does not consistently use getNextChar() to obtain characters from the source files, so some NULs, such as those in // comments, are not diagnosed and are simply ignored.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D101923

Files:
  llvm/lib/TableGen/TGLexer.cpp


Index: llvm/lib/TableGen/TGLexer.cpp
===================================================================

--- llvm/lib/TableGen/TGLexer.cpp
+++ llvm/lib/TableGen/TGLexer.cpp
@@ -109,15 +109,23 @@
   default:
     return (unsigned char)CurChar;
   case 0: {
-    // A nul character in the stream is either the end of the current buffer or
-    // a random nul in the file.  Disambiguate that here.
-    if (CurPtr-1 != CurBuf.end())
-      return 0;  // Just whitespace.
-
-    // Otherwise, return end of file.
-    --CurPtr;  // Another call to lex will return EOF again.
-    return EOF;
+    // A NUL character in the stream is either the end of the current buffer or
+    // a spurious NUL in the file.  Disambiguate that here.
+    if (CurPtr-1 == CurBuf.end()) {
+      --CurPtr; // Arrange for another call to return EOF again.
+      return EOF;
+    }
+    PrintError(getLoc(), "NUL character is invalid in source; treated as space");
+    return ' ';
   }
+
+////    if (CurPtr-1 != CurBuf.end())
+////      return 0;  // Just whitespace.
+////
+////    // Otherwise, return end of file.
+////    --CurPtr;  // Another call to lex will return EOF again.
+////    return EOF;
+////  }
   case '\n':
   case '\r':
     // Handle the newline character by ignoring it and incrementing the line
@@ -197,7 +205,7 @@
     PrintFatalError("getNextChar() must never return '\r'");
     return tgtok::Error;
 
-  case 0:
+////  case 0:
   case ' ':
   case '\t':
     // Ignore whitespace.
@@ -415,24 +423,31 @@
   return false;
 }
 
+/// Skip over the comment by finding the next CR or LF. Or we may end up
+/// at the end of the buffer.
 void TGLexer::SkipBCPLComment() {
   ++CurPtr;  // skip the second slash.
-  while (true) {
-    switch (*CurPtr) {
-    case '\n':
-    case '\r':
-      return;  // Newline is end of comment.
-    case 0:
-      // If this is the end of the buffer, end the comment.
-      if (CurPtr == CurBuf.end())
-        return;
-      break;
-    }
-    // Otherwise, skip the character.
-    ++CurPtr;
-  }
+  auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
+  CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
 }
 
+////  ++CurPtr;  // skip the second slash.
+////  while (true) {
+////    switch (*CurPtr) {
+////    case '\n':
+////    case '\r':
+////      return;  // Newline is end of comment.
+////    case 0:
+////      // If this is the end of the buffer, end the comment.
+////      if (CurPtr == CurBuf.end())
+////        return;
+////      break;
+////    }
+////    // Otherwise, skip the character.
+////    ++CurPtr;
+////  }
+////}
+
 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
 /// is that we allow nesting.
 bool TGLexer::SkipCComment() {


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D101923.343096.patch
Type: text/x-patch
Size: 2776 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210505/f50a72bd/attachment.bin>