[flang-commits] [flang] [llvm] [flang] [flang-rt] Addition of the Fortran 2023 TOKENIZE intrinsic. (PR #181030)

Tue Feb 24 02:45:53 PST 2026

================
@@ -571,6 +572,419 @@ static RT_API_ATTRS void MaxMin(Descriptor &accumulator, const Descriptor &x,
   }
 }
 
+// Returns true when `ch` compares equal to any of the `setChars` characters
+// starting at `set`, and false otherwise (including when `setChars` is 0).
+template <typename CHAR>
+static inline RT_API_ATTRS bool TokenizeIsInSet(
+    CHAR ch, const CHAR *set, std::size_t setChars) {
+  const CHAR *const setEnd{set + setChars};
+  for (const CHAR *probe{set}; probe != setEnd; ++probe) {
+    if (*probe == ch) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Writes `chars` space characters of type CHAR starting at `to`.
+// Does nothing when `chars` is zero.
+template <typename CHAR>
+static inline RT_API_ATTRS void TokenizeFillBlanks(
+    CHAR *to, std::size_t chars) {
+  if (chars > 0) {
+    if constexpr (!std::is_same_v<CHAR, char>) {
+      // Wider character kinds are filled one element at a time.
+      CHAR *const stop{to + chars};
+      for (CHAR *cursor{to}; cursor != stop; ++cursor) {
+        *cursor = static_cast<CHAR>(' ');
+      }
+    } else {
+      runtime::memset(to, ' ', chars);
+    }
+  }
+}
+
+// Summary of a tokenization pass over a string.
+struct TokenizeAnalysis {
+  // Number of tokens found.
+  std::size_t tokenCount{};
+  // Length of the longest token, counted in characters (not bytes).
+  std::size_t maxTokenLen{};
+};
+
+// Counts the tokens obtained by splitting `str` (`strChars` characters) at
+// every character that appears in `set` (`setChars` characters), and records
+// the length of the longest token.
+//  - Empty `str`: zero tokens are reported.
+//  - Empty `set`: the whole of `str` is reported as a single token.
+//  - Otherwise each delimiter ends a token, so N delimiters give N + 1
+//    tokens; consecutive delimiters and delimiters at either end of `str`
+//    produce zero-length tokens.
+template <typename CHAR>
+static RT_API_ATTRS TokenizeAnalysis AnalyzeTokenize(const CHAR *str,
+    std::size_t strChars, const CHAR *set, std::size_t setChars) {
+  TokenizeAnalysis result;
+  if (strChars == 0) {
+    return result;
+  }
+  if (setChars == 0) {
+    result.tokenCount = 1;
+    result.maxTokenLen = strChars;
+    return result;
+  }
+
+  // Treat the end of STRING as one more token boundary so that the final
+  // token is closed by the same code path as the others. The short-circuit
+  // keeps str[strChars] from ever being read.
+  std::size_t begin{0};
+  for (std::size_t at{0}; at <= strChars; ++at) {
+    if (at == strChars || TokenizeIsInSet(str[at], set, setChars)) {
+      const std::size_t length{at - begin};
+      if (result.maxTokenLen < length) {
+        result.maxTokenLen = length;
+      }
+      ++result.tokenCount;
+      begin = at + 1;
+    }
+  }
+
+  return result;
+}
+// Allocates and populates the result arrays for TOKENIZE Form 1.
+template <typename CHAR>
+static RT_API_ATTRS void TokenizeFillForm1(Descriptor &tokens,
+    Descriptor *separator, const Descriptor &string, const CHAR *str,
+    std::size_t strChars, const CHAR *set, std::size_t setChars,
+    const TokenizeAnalysis &analysis, Terminator &terminator) {
+
+  // (Re)allocate TOKENS.
+  if (tokens.IsAllocated()) {
+    tokens.Deallocate();
+  }
+  SubscriptValue tokensExtent[1]{
+      static_cast<SubscriptValue>(analysis.tokenCount)};
+  std::size_t tokenElemBytes{
+      analysis.tokenCount == 0 ? 0 : analysis.maxTokenLen * sizeof(CHAR)};
+  tokens.Establish(string.type(), tokenElemBytes, nullptr, 1, tokensExtent,
+      CFI_attribute_allocatable);
+  tokens.GetDimension(0).SetBounds(1, tokensExtent[0]);
+  if (tokens.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
+    terminator.Crash("TOKENIZE: could not allocate TOKENS array");
+  }
+
+  // (Re)allocate SEPARATOR if present.
+  std::size_t sepCount{analysis.tokenCount > 0 ? analysis.tokenCount - 1 : 0};
+  std::size_t sepElemBytes{sizeof(CHAR)};
+  if (separator) {
+    if (separator->IsAllocated()) {
+      separator->Deallocate();
+    }
+    SubscriptValue sepExtent[1]{static_cast<SubscriptValue>(sepCount)};
+    separator->Establish(string.type(), sepElemBytes, nullptr, 1, sepExtent,
+        CFI_attribute_allocatable);
+    separator->GetDimension(0).SetBounds(1, sepExtent[0]);
+    if (separator->Allocate(kNoAsyncObject) != CFI_SUCCESS) {
+      terminator.Crash("TOKENIZE: could not allocate SEPARATOR array");
+    }
+  }
+
+  if (analysis.tokenCount == 0) {
+    return;
+  }
+
+  // Populate tokens and separators.
+  if (setChars == 0) {
+    // One token (possibly empty) equal to STRING.
+    if (tokenElemBytes > 0) {
+      CHAR *tokDest{tokens.OffsetElement<CHAR>(0)};
+      TokenizeFillBlanks(tokDest, analysis.maxTokenLen);
+      if (strChars > 0) {
+        runtime::memcpy(tokDest, str, strChars * sizeof(CHAR));
+      }
+    }
+    return;
+  }
+
+  std::size_t tokenIndex{0};
+  std::size_t sepIndex{0};
+
+  auto storeToken = [&](std::size_t tokenStart, std::size_t tokenEnd) {
+    std::size_t tokenLen{tokenEnd - tokenStart};
+    if (tokenElemBytes > 0) {
+      // Each element is stored in a fixed-size slot of `tokenElemBytes`.
+      CHAR *tokDest{tokens.OffsetElement<CHAR>(tokenIndex * tokenElemBytes)};
+      TokenizeFillBlanks(tokDest, analysis.maxTokenLen);
----------------
tblah wrote:

We only need to fill the bytes which are not going to be written to by the memcpy.

https://github.com/llvm/llvm-project/pull/181030