[flang-commits] [flang] [llvm] [flang] [flang-rt] Addition of the Fortran 2023 TOKENIZE intrinsic. (PR #181030)

Wed Feb 25 08:17:33 PST 2026

================
@@ -571,6 +572,419 @@ static RT_API_ATTRS void MaxMin(Descriptor &accumulator, const Descriptor &x,
   }
 }
 
+// Reports whether the character `ch` matches any of the first `setChars`
+// characters of `set`. An empty set never matches.
+template <typename CHAR>
+static inline RT_API_ATTRS bool TokenizeIsInSet(
+    CHAR ch, const CHAR *set, std::size_t setChars) {
+  // Walk the set with a countdown; stop at the first matching character.
+  for (std::size_t remaining{setChars}; remaining > 0; --remaining) {
+    if (*set++ == ch) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Writes `chars` blank (space) characters starting at `to`; used to pad a
+// token. Does nothing when `chars` is zero.
+template <typename CHAR>
+static inline RT_API_ATTRS void TokenizeFillBlanks(
+    CHAR *to, std::size_t chars) {
+  if (chars > 0) {
+    if constexpr (!std::is_same_v<CHAR, char>) {
+      // Character types other than `char` are filled one element at a time.
+      const CHAR blank{static_cast<CHAR>(' ')};
+      for (std::size_t remaining{chars}; remaining > 0; --remaining) {
+        *to++ = blank;
+      }
+    } else {
+      // Plain `char` is filled with a single memset call.
+      runtime::memset(to, ' ', chars);
+    }
+  }
+}
+
+struct TokenizeAnalysis { // token statistics returned by AnalyzeTokenize
+  std::size_t tokenCount{0}; // number of tokens found in STRING
+  std::size_t maxTokenLen{0}; // length of the longest token, in characters
+};
+
+// Makes a single pass over the `strChars` characters of `str` and reports
+// how many tokens it splits into and the length, in characters, of the
+// longest token. Every character found in `set` (of `setChars` characters)
+// acts as a delimiter.
+//  - An empty string yields zero tokens.
+//  - An empty set yields exactly one token: the whole string.
+//  - Otherwise consecutive, leading, or trailing delimiters each contribute
+//    an empty token.
+template <typename CHAR>
+static RT_API_ATTRS TokenizeAnalysis AnalyzeTokenize(const CHAR *str,
+    std::size_t strChars, const CHAR *set, std::size_t setChars) {
+  TokenizeAnalysis result;
+  if (strChars == 0) {
+    return result;
+  }
+  if (setChars == 0) {
+    // No delimiters: the entire string is the only token.
+    result.tokenCount = 1;
+    result.maxTokenLen = strChars;
+    return result;
+  }
+
+  std::size_t begin{0}; // index of the first character of the current token
+  std::size_t index{0};
+  while (index < strChars) {
+    if (TokenizeIsInSet(str[index], set, setChars)) {
+      // A delimiter closes the current token, which may be empty.
+      const std::size_t length{index - begin};
+      if (length > result.maxTokenLen) {
+        result.maxTokenLen = length;
+      }
+      ++result.tokenCount;
+      begin = index + 1;
+    }
+    ++index;
+  }
+
+  // The text after the last delimiter (possibly empty) is the final token.
+  const std::size_t tailLength{strChars - begin};
+  if (tailLength > result.maxTokenLen) {
+    result.maxTokenLen = tailLength;
+  }
+  ++result.tokenCount;
+
+  return result;
+}
+// Allocates and populates the result arrays for TOKENIZE Form 1.
+template <typename CHAR>
+static RT_API_ATTRS void TokenizeFillForm1(Descriptor &tokens,
+    Descriptor *separator, const Descriptor &string, const CHAR *str,
+    std::size_t strChars, const CHAR *set, std::size_t setChars,
+    const TokenizeAnalysis &analysis, Terminator &terminator) {
+
+  // (Re)allocate TOKENS.
+  if (tokens.IsAllocated()) {
+    tokens.Deallocate();
+  }
+  SubscriptValue tokensExtent[1]{
+      static_cast<SubscriptValue>(analysis.tokenCount)};
+  std::size_t tokenElemBytes{
+      analysis.tokenCount == 0 ? 0 : analysis.maxTokenLen * sizeof(CHAR)};
+  tokens.Establish(string.type(), tokenElemBytes, nullptr, 1, tokensExtent,
+      CFI_attribute_allocatable);
+  tokens.GetDimension(0).SetBounds(1, tokensExtent[0]);
+  if (tokens.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
+    terminator.Crash("TOKENIZE: could not allocate TOKENS array");
+  }
+
+  // (Re)allocate SEPARATOR if present.
+  std::size_t sepCount{analysis.tokenCount > 0 ? analysis.tokenCount - 1 : 0};
+  std::size_t sepElemBytes{sizeof(CHAR)};
+  if (separator) {
+    if (separator->IsAllocated()) {
+      separator->Deallocate();
+    }
+    SubscriptValue sepExtent[1]{static_cast<SubscriptValue>(sepCount)};
+    separator->Establish(string.type(), sepElemBytes, nullptr, 1, sepExtent,
+        CFI_attribute_allocatable);
+    separator->GetDimension(0).SetBounds(1, sepExtent[0]);
+    if (separator->Allocate(kNoAsyncObject) != CFI_SUCCESS) {
+      terminator.Crash("TOKENIZE: could not allocate SEPARATOR array");
+    }
+  }
+
+  if (analysis.tokenCount == 0) {
+    return;
+  }
+
+  // Populate tokens and separators.
+  if (setChars == 0) {
+    // One token (possibly empty) equal to STRING.
+    if (tokenElemBytes > 0) {
+      CHAR *tokDest{tokens.OffsetElement<CHAR>(0)};
+      TokenizeFillBlanks(tokDest, analysis.maxTokenLen);
----------------
kwyatt-ext wrote:

Yes it would be.  Since they are set programmatically in AnalyzeTokenize to be equal whenever setChars == 0, that condition shouldn't occur.

If you are implying that the call to TokenizeFillBlanks is not necessary, that is true. It could be removed; it was only there to keep the approach to populating the token return object consistent, and hopefully clearer when reading.

https://github.com/llvm/llvm-project/pull/181030