[flang-commits] [flang] [llvm] [flang] [flang-rt] Addition of the Fortran 2023 TOKENIZE intrinsic. (PR #181030)
Tom Eccles via flang-commits
flang-commits at lists.llvm.org
Tue Feb 24 02:45:53 PST 2026
================
@@ -571,6 +572,419 @@ static RT_API_ATTRS void MaxMin(Descriptor &accumulator, const Descriptor &x,
}
}
+// Reports whether `ch` compares equal to any of the first `setChars`
+// characters of `set`. A set of zero characters never matches.
+template <typename CHAR>
+static inline RT_API_ATTRS bool TokenizeIsInSet(
+    CHAR ch, const CHAR *set, std::size_t setChars) {
+  bool found{false};
+  // Linear scan that stops at the first match.
+  for (std::size_t at{0}; !found && at < setChars; ++at) {
+    found = set[at] == ch;
+  }
+  return found;
+}
+
+// Blank-pads a token: stores `chars` space characters starting at `to`.
+// Does nothing when `chars` is zero.
+template <typename CHAR>
+static inline RT_API_ATTRS void TokenizeFillBlanks(
+    CHAR *to, std::size_t chars) {
+  if (chars != 0) {
+    if constexpr (!std::is_same_v<CHAR, char>) {
+      // Character types other than `char` are blanked one element at a time.
+      CHAR *const end{to + chars};
+      for (CHAR *p{to}; p != end; ++p) {
+        *p = static_cast<CHAR>(' ');
+      }
+    } else {
+      // Plain `char` can be filled with a single memset.
+      runtime::memset(to, ' ', chars);
+    }
+  }
+}
+
+// Summary of how a string splits into tokens; both fields start at zero.
+struct TokenizeAnalysis {
+  // Number of tokens.
+  std::size_t tokenCount{};
+  // Length of the longest token, in characters.
+  std::size_t maxTokenLen{};
+};
+
+// Counts the tokens that `str` (of `strChars` characters) splits into when
+// every character found in `set` (of `setChars` characters) acts as a
+// delimiter, and records the length of the longest token.
+//  - A zero-length string yields zero tokens.
+//  - An empty delimiter set yields one token spanning the whole string.
+//  - Otherwise each delimiter ends a token, so consecutive delimiters and
+//    delimiters at either end of the string produce zero-length tokens.
+template <typename CHAR>
+static RT_API_ATTRS TokenizeAnalysis AnalyzeTokenize(const CHAR *str,
+    std::size_t strChars, const CHAR *set, std::size_t setChars) {
+  TokenizeAnalysis result;
+  if (strChars == 0) {
+    return result;
+  }
+  if (setChars == 0) {
+    result.tokenCount = 1;
+    result.maxTokenLen = strChars;
+    return result;
+  }
+
+  // The token after the last delimiter always exists, so start at one.
+  std::size_t count{1};
+  std::size_t longest{0};
+  std::size_t begin{0}; // index of the first character of the current token
+  for (std::size_t at{0}; at < strChars; ++at) {
+    if (!TokenizeIsInSet(str[at], set, setChars)) {
+      continue;
+    }
+    // A delimiter closes the token occupying [begin, at).
+    longest = std::max(longest, at - begin);
+    ++count;
+    begin = at + 1;
+  }
+
+  // Account for the final token, which runs to the end of the string.
+  result.tokenCount = count;
+  result.maxTokenLen = std::max(longest, strChars - begin);
+  return result;
+}
+// Allocates and populates the result arrays for TOKENIZE Form 1.
+template <typename CHAR>
+static RT_API_ATTRS void TokenizeFillForm1(Descriptor &tokens,
+ Descriptor *separator, const Descriptor &string, const CHAR *str,
+ std::size_t strChars, const CHAR *set, std::size_t setChars,
+ const TokenizeAnalysis &analysis, Terminator &terminator) {
+
+ // (Re)allocate TOKENS.
+ if (tokens.IsAllocated()) {
+ tokens.Deallocate();
+ }
+ SubscriptValue tokensExtent[1]{
+ static_cast<SubscriptValue>(analysis.tokenCount)};
+ std::size_t tokenElemBytes{
+ analysis.tokenCount == 0 ? 0 : analysis.maxTokenLen * sizeof(CHAR)};
+ tokens.Establish(string.type(), tokenElemBytes, nullptr, 1, tokensExtent,
+ CFI_attribute_allocatable);
+ tokens.GetDimension(0).SetBounds(1, tokensExtent[0]);
+ if (tokens.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
+ terminator.Crash("TOKENIZE: could not allocate TOKENS array");
+ }
+
+ // (Re)allocate SEPARATOR if present.
+ std::size_t sepCount{analysis.tokenCount > 0 ? analysis.tokenCount - 1 : 0};
+ std::size_t sepElemBytes{sizeof(CHAR)};
+ if (separator) {
+ if (separator->IsAllocated()) {
+ separator->Deallocate();
+ }
+ SubscriptValue sepExtent[1]{static_cast<SubscriptValue>(sepCount)};
+ separator->Establish(string.type(), sepElemBytes, nullptr, 1, sepExtent,
+ CFI_attribute_allocatable);
+ separator->GetDimension(0).SetBounds(1, sepExtent[0]);
+ if (separator->Allocate(kNoAsyncObject) != CFI_SUCCESS) {
+ terminator.Crash("TOKENIZE: could not allocate SEPARATOR array");
+ }
+ }
+
+ if (analysis.tokenCount == 0) {
+ return;
+ }
+
+ // Populate tokens and separators.
+ if (setChars == 0) {
+ // One token (possibly empty) equal to STRING.
+ if (tokenElemBytes > 0) {
+ CHAR *tokDest{tokens.OffsetElement<CHAR>(0)};
+ TokenizeFillBlanks(tokDest, analysis.maxTokenLen);
+ if (strChars > 0) {
+ runtime::memcpy(tokDest, str, strChars * sizeof(CHAR));
+ }
+ }
+ return;
+ }
+
+ std::size_t tokenIndex{0};
+ std::size_t sepIndex{0};
+
+ auto storeToken = [&](std::size_t tokenStart, std::size_t tokenEnd) {
+ std::size_t tokenLen{tokenEnd - tokenStart};
+ if (tokenElemBytes > 0) {
+ // Each element is stored in a fixed-size slot of `tokenElemBytes`.
+ CHAR *tokDest{tokens.OffsetElement<CHAR>(tokenIndex * tokenElemBytes)};
+ TokenizeFillBlanks(tokDest, analysis.maxTokenLen);
----------------
tblah wrote:
We only need to fill the bytes that are not going to be overwritten by the memcpy.
https://github.com/llvm/llvm-project/pull/181030
More information about the flang-commits
mailing list