[clang] [HLSL][RootSignature] Implement Lexing of DescriptorTables (PR #122981)

Tue Feb 11 09:27:06 PST 2025

================
@@ -0,0 +1,194 @@
+#include "clang/Parse/ParseHLSLRootSignature.h"
+
+namespace clang {
+namespace hlsl {
+
+// Lexer Definitions
+
+static bool IsNumberChar(char C) {
+  // TODO(#126565): extend for float support exponents
+  return isdigit(C); // integer support
+}
+
+bool RootSignatureLexer::LexNumber(RootSignatureToken &Result) {
+  // NumericLiteralParser does not handle the sign so we will manually apply it
+  bool Negative = Buffer.front() == '-';
+  bool Signed = Negative || Buffer.front() == '+';
+  if (Signed)
+    AdvanceBuffer();
+
+  // Retrieve the possible number
+  StringRef NumSpelling = Buffer.take_while(IsNumberChar);
+
+  // Catch when there is a '+' or '-' specified but no literal value after.
+  // This is invalid but the NumericLiteralParser will accept this as valid.
+  if (NumSpelling.empty()) {
+    PP.getDiagnostics().Report(Result.TokLoc,
+                               diag::err_hlsl_expected_number_literal);
+    return true;
+  }
+
+  // Parse the numeric value and do semantic checks on its specification
+  clang::NumericLiteralParser Literal(NumSpelling, SourceLoc,
+                                      PP.getSourceManager(), PP.getLangOpts(),
+                                      PP.getTargetInfo(), PP.getDiagnostics());
+  if (Literal.hadError)
+    return true; // Error has already been reported so just return
+
+  // Note: if IsNumberChar allows for hexidecimal we will need to turn this
+  // into a diagnostics for potential fixed-point literals
+  assert(Literal.isIntegerLiteral() && "IsNumberChar will only support digits");
+
+  // Retrieve the number value to store into the token
+  Result.Kind = TokenKind::int_literal;
+
+  // NOTE: for compabibility with DXC, we will treat any integer with '+' as an
+  // unsigned integer
+  llvm::APSInt X = llvm::APSInt(32, !Negative);
+  if (Literal.GetIntegerValue(X)) {
+    // Report that the value has overflowed
+    PP.getDiagnostics().Report(Result.TokLoc,
+                               diag::err_hlsl_number_literal_overflow)
+        << (unsigned)!Signed << NumSpelling;
+    return true;
+  }
+
+  X = Negative ? -X : X;
+  Result.NumLiteral = APValue(X);
+
+  AdvanceBuffer(NumSpelling.size());
+  return false;
+}
+
+bool RootSignatureLexer::LexToken(RootSignatureToken &Result) {
+  // Discard any leading whitespace
+  AdvanceBuffer(Buffer.take_while(isspace).size());
+
+  // Record where this token is in the text for usage in parser diagnostics
+  Result = RootSignatureToken(SourceLoc);
+
+  char C = Buffer.front();
+
+  // Punctuators
+  switch (C) {
+#define PUNCTUATOR(X, Y)                                                       \
+  case Y: {                                                                    \
+    Result.Kind = TokenKind::pu_##X;                                           \
+    AdvanceBuffer();                                                           \
+    return false;                                                              \
+  }
+#include "clang/Parse/HLSLRootSignatureTokenKinds.def"
+  default:
+    break;
+  }
+
+  // Numeric constant
+  if (isdigit(C) || C == '-' || C == '+')
+    return LexNumber(Result);
+
+  // All following tokens require at least one additional character
+  if (Buffer.size() <= 1) {
+    PP.getDiagnostics().Report(Result.TokLoc, diag::err_hlsl_invalid_token);
+    return true;
+  }
+
+  // Peek at the next character to deteremine token type
+  char NextC = Buffer[1];
+
+  // Registers: [tsub][0-9+]
+  if ((C == 't' || C == 's' || C == 'u' || C == 'b') && isdigit(NextC)) {
+    AdvanceBuffer();
+
+    if (LexNumber(Result))
+      return true; // Error parsing number which is already reported
+
+    // Lex number could also parse a float so ensure it was an unsigned int
+    if (Result.Kind != TokenKind::int_literal ||
+        Result.NumLiteral.getInt().isSigned()) {
+      // Return invalid number literal for register error
+      PP.getDiagnostics().Report(Result.TokLoc,
+                                 diag::err_hlsl_invalid_register_literal);
+      return true;
+    }
+
+    // Convert character to the register type.
+    // This is done after LexNumber to override the TokenKind
+    switch (C) {
+    case 'b':
+      Result.Kind = TokenKind::bReg;
+      break;
+    case 't':
+      Result.Kind = TokenKind::tReg;
+      break;
+    case 'u':
+      Result.Kind = TokenKind::uReg;
+      break;
+    case 's':
+      Result.Kind = TokenKind::sReg;
+      break;
+    default:
+      llvm_unreachable("Switch for an expected token was not provided");
+    }
+    return false;
+  }
+
+  // Keywords and Enums:
+  StringRef TokSpelling =
+      Buffer.take_while([](char C) { return isalnum(C) || C == '_'; });
+
+  // Define a large string switch statement for all the keywords and enums
+  auto Switch = llvm::StringSwitch<TokenKind>(TokSpelling);
+#define KEYWORD(NAME) Switch.Case(#NAME, TokenKind::kw_##NAME);
+#define ENUM(NAME, LIT) Switch.CaseLower(LIT, TokenKind::en_##NAME);
+#include "clang/Parse/HLSLRootSignatureTokenKinds.def"
+
+  // Then attempt to retreive a string from it
+  auto Kind = Switch.Default(TokenKind::invalid);
+  if (Kind == TokenKind::invalid) {
+    PP.getDiagnostics().Report(Result.TokLoc, diag::err_hlsl_invalid_token);
+    return true;
+  }
+
+  Result.Kind = Kind;
+  AdvanceBuffer(TokSpelling.size());
+  return false;
+}
+
+bool RootSignatureLexer::ConsumeToken() {
+  // If we previously peeked then just copy the value over
+  if (NextToken && NextToken->Kind != TokenKind::end_of_stream) {
+    CurToken = *NextToken;
+    NextToken = std::nullopt;
+    return false;
+  }
+
+  // This will be implicity be true if NextToken->Kind == end_of_stream
+  if (EndOfBuffer()) {
+    // Report unexpected end of tokens error
+    PP.getDiagnostics().Report(SourceLoc,
+                               diag::err_hlsl_rootsig_unexpected_eos);
----------------
llvm-beanz wrote:

This is similar to the case above. Is it better for the lexer to report this, or to surface an end_of_stream token and have the parser report a more context-aware error? I tend to think the later.

https://github.com/llvm/llvm-project/pull/122981