[flang-commits] [flang] [flang][runtime] Handle multi-byte characters while tabbing (PR #101388)

via flang-commits flang-commits at lists.llvm.org
Wed Jul 31 12:00:09 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-flang-runtime

Author: Peter Klausler (klausler)

<details>
<summary>Changes</summary>

When repositioning within the current record with control edit descriptors (Xn, Tn, TLn, TRn), correctly handle multi-byte character encodings.  This affects only external I/O to units with UTF-8 encoding.

---
Full diff: https://github.com/llvm/llvm-project/pull/101388.diff


9 Files Affected:

- (modified) flang/runtime/format-implementation.h (+86-19) 
- (modified) flang/runtime/internal-unit.cpp (+23) 
- (modified) flang/runtime/internal-unit.h (+2) 
- (modified) flang/runtime/io-stmt.cpp (+13) 
- (modified) flang/runtime/io-stmt.h (+7-3) 
- (modified) flang/runtime/unit.cpp (+13-1) 
- (modified) flang/runtime/unit.h (+2) 
- (modified) flang/runtime/utf.cpp (+11) 
- (modified) flang/runtime/utf.h (+3) 


``````````diff
diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index 45d4bd641f6f6..b0b6f9a75b969 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -113,6 +113,84 @@ RT_API_ATTRS int FormatControl<CONTEXT>::GetIntField(
   return result;
 }
 
+// Xn, TRn, TLn
+template <typename CONTEXT>
+static RT_API_ATTRS bool RelativeTabbing(CONTEXT &context, int n) {
+  ConnectionState &connection{context.GetConnectionState()};
+  if constexpr (std::is_same_v<CONTEXT,
+                    ExternalFormattedIoStatementState<Direction::Input>> ||
+      std::is_same_v<CONTEXT,
+          ExternalFormattedIoStatementState<Direction::Output>>) {
+    if (n != 0 && connection.isUTF8) {
+      const char *p{};
+      if (n > 0) { // Xn or TRn
+        std::size_t bytesLeft{context.GetNextInputBytes(p)};
+        // Skip 'n' multi-byte characters.  If that's more than are in the
+        // current record, that's valid -- the program can position past the
+        // end and then reposition back with Tn or TLn.
+        for (; n > 0 && bytesLeft; --n) {
+          std::size_t byteCount{MeasureUTF8Bytes(*p)};
+          if (byteCount > bytesLeft) {
+            break;
+          }
+          context.HandleRelativePosition(byteCount);
+          bytesLeft -= byteCount;
+          // Don't call GotChar(byteCount), these don't count towards SIZE=
+          p += byteCount;
+        }
+      } else { // n < 0: TLn
+        n = -n;
+        if (std::int64_t excess{connection.positionInRecord -
+                connection.recordLength.value_or(connection.positionInRecord)};
+            excess > 0) {
+          // Have tabbed past the end of the record
+          if (excess >= n) {
+            context.HandleRelativePosition(-n);
+            return true;
+          }
+          context.HandleRelativePosition(-excess);
+          n -= excess;
+        }
+        std::size_t bytesLeft{context.GetPreviousInputBytes(p)};
+        // Go back 'n' multi-byte characters.
+        for (; n > 0 && bytesLeft; --n) {
+          std::size_t byteCount{MeasurePreviousUTF8Bytes(p, bytesLeft)};
+          context.HandleRelativePosition(-byteCount);
+          bytesLeft -= byteCount;
+          p -= byteCount;
+        }
+      }
+    }
+  }
+  if (connection.internalIoCharKind > 1) {
+    n *= connection.internalIoCharKind;
+  }
+  context.HandleRelativePosition(n);
+  return true;
+}
+
+// Tn
+template <typename CONTEXT>
+static RT_API_ATTRS bool AbsoluteTabbing(CONTEXT &context, int n) {
+  ConnectionState &connection{context.GetConnectionState()};
+  n = n > 0 ? n - 1 : 0; // convert 1-based position to 0-based offset
+  if constexpr (std::is_same_v<CONTEXT,
+                    ExternalFormattedIoStatementState<Direction::Input>> ||
+      std::is_same_v<CONTEXT,
+          ExternalFormattedIoStatementState<Direction::Output>>) {
+    if (connection.isUTF8) {
+      // Reset to the beginning of the record, then TR(n-1)
+      connection.HandleAbsolutePosition(0);
+      return RelativeTabbing(context, n);
+    }
+  }
+  if (connection.internalIoCharKind > 1) {
+    n *= connection.internalIoCharKind;
+  }
+  context.HandleAbsolutePosition(n);
+  return true;
+}
+
 template <typename CONTEXT>
 static RT_API_ATTRS void HandleControl(
     CONTEXT &context, char ch, char next, int n) {
@@ -169,12 +247,7 @@ static RT_API_ATTRS void HandleControl(
     }
     break;
   case 'X':
-    if (!next) {
-      ConnectionState &connection{context.GetConnectionState()};
-      if (connection.internalIoCharKind > 1) {
-        n *= connection.internalIoCharKind;
-      }
-      context.HandleRelativePosition(n);
+    if (!next && RelativeTabbing(context, n)) {
       return;
     }
     break;
@@ -190,19 +263,13 @@ static RT_API_ATTRS void HandleControl(
     break;
   case 'T': {
     if (!next) { // Tn
-      --n; // convert 1-based to 0-based
-    }
-    ConnectionState &connection{context.GetConnectionState()};
-    if (connection.internalIoCharKind > 1) {
-      n *= connection.internalIoCharKind;
-    }
-    if (!next) { // Tn
-      context.HandleAbsolutePosition(n);
-      return;
-    }
-    if (next == 'L' || next == 'R') { // TLn & TRn
-      context.HandleRelativePosition(next == 'L' ? -n : n);
-      return;
+      if (AbsoluteTabbing(context, n)) {
+        return;
+      }
+    } else if (next == 'R' || next == 'L') { // TRn / TLn
+      if (RelativeTabbing(context, next == 'L' ? -n : n)) {
+        return;
+      }
     }
   } break;
   default:
diff --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp
index 4097ea659edd4..1c569789d95c7 100644
--- a/flang/runtime/internal-unit.cpp
+++ b/flang/runtime/internal-unit.cpp
@@ -80,6 +80,7 @@ RT_API_ATTRS bool InternalDescriptorUnit<DIR>::Emit(
 template <Direction DIR>
 RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
     const char *&p, IoErrorHandler &handler) {
+  p = nullptr;
   if constexpr (DIR == Direction::Output) {
     handler.Crash("InternalDescriptorUnit<Direction::Output>::"
                   "GetNextInputBytes() called");
@@ -98,6 +99,28 @@ RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
   }
 }
 
+template <Direction DIR>
+RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetPreviousInputBytes(
+    const char *&p, IoErrorHandler &handler) {
+  p = nullptr;
+  if constexpr (DIR == Direction::Output) {
+    handler.Crash("InternalDescriptorUnit<Direction::Output>::"
+                  "GetPreviousInputBytes() called");
+    return 0;
+  } else {
+    const char *record{CurrentRecord()};
+    if (!record) {
+      handler.SignalEnd();
+      return 0;
+    } else {
+      if (positionInRecord < recordLength.value_or(positionInRecord)) {
+        p = &record[positionInRecord];
+      }
+      return positionInRecord - leftTabLimit.value_or(0);
+    }
+  }
+}
+
 template <Direction DIR>
 RT_API_ATTRS bool InternalDescriptorUnit<DIR>::AdvanceRecord(
     IoErrorHandler &handler) {
diff --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h
index bcd38b62468af..f33d8df6b7e4e 100644
--- a/flang/runtime/internal-unit.h
+++ b/flang/runtime/internal-unit.h
@@ -31,6 +31,8 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
 
   RT_API_ATTRS bool Emit(const char *, std::size_t, IoErrorHandler &);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+  RT_API_ATTRS std::size_t GetPreviousInputBytes(
+      const char *&, IoErrorHandler &);
   RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
   RT_API_ATTRS void BackspaceRecord(IoErrorHandler &);
   RT_API_ATTRS std::int64_t InquirePos();
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 1a5d32ecd8c5a..9c63409166821 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -32,6 +32,11 @@ std::size_t IoStatementBase::GetNextInputBytes(const char *&p) {
   return 0;
 }
 
+std::size_t IoStatementBase::GetPreviousInputBytes(const char *&p) {
+  p = nullptr;
+  return 0;
+}
+
 bool IoStatementBase::AdvanceRecord(int) { return false; }
 
 void IoStatementBase::BackspaceRecord() {}
@@ -105,6 +110,8 @@ std::size_t InternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
   return unit_.GetNextInputBytes(p, *this);
 }
 
+// InternalIoStatementState<DIR>::GetPreviousInputBytes() not needed or defined
+
 template <Direction DIR>
 bool InternalIoStatementState<DIR>::AdvanceRecord(int n) {
   while (n-- > 0) {
@@ -413,6 +420,12 @@ std::size_t ExternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
   return unit().GetNextInputBytes(p, *this);
 }
 
+template <Direction DIR>
+std::size_t ExternalIoStatementState<DIR>::GetPreviousInputBytes(
+    const char *&p) {
+  return unit().GetPreviousInputBytes(p, *this);
+}
+
 template <Direction DIR>
 bool ExternalIoStatementState<DIR>::AdvanceRecord(int n) {
   while (n-- > 0) {
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index 6053aeb777b7a..bec466f39683a 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -93,6 +93,7 @@ class IoStatementState {
       const char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
   RT_API_ATTRS bool AdvanceRecord(int = 1);
   RT_API_ATTRS void BackspaceRecord();
   RT_API_ATTRS void HandleRelativePosition(std::int64_t byteOffset);
@@ -132,9 +133,9 @@ class IoStatementState {
   RT_API_ATTRS Fortran::common::optional<char32_t> GetCurrentChar(
       std::size_t &byteCount);
 
-  // The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
-  // are always in units of bytes, not characters; the distinction matters
-  // for internal input from CHARACTER(KIND=2 and 4).
+  // The result of CueUpInput() and the "remaining" arguments to SkipSpaces()
+  // and NextInField() are always in units of bytes, not characters; the
+  // distinction matters for internal input from CHARACTER(KIND=2 and 4).
 
   // For fixed-width fields, return the number of remaining bytes.
   // Skip over leading blanks.
@@ -279,6 +280,7 @@ class IoStatementBase : public IoErrorHandler {
   RT_API_ATTRS bool Receive(
       char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
   RT_API_ATTRS bool AdvanceRecord(int);
   RT_API_ATTRS void BackspaceRecord();
   RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -473,6 +475,7 @@ class ExternalIoStatementState : public ExternalIoStatementBase,
   RT_API_ATTRS bool Emit(
       const char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
   RT_API_ATTRS bool AdvanceRecord(int = 1);
   RT_API_ATTRS void BackspaceRecord();
   RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -539,6 +542,7 @@ class ChildIoStatementState : public IoStatementBase,
   RT_API_ATTRS bool Emit(
       const char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
   RT_API_ATTRS void HandleRelativePosition(std::int64_t);
   RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
 
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 5c5bca835f3d8..78fc2ea18b71a 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -134,7 +134,8 @@ bool ExternalFileUnit::Receive(char *data, std::size_t bytes,
 
 std::size_t ExternalFileUnit::GetNextInputBytes(
     const char *&p, IoErrorHandler &handler) {
-  RUNTIME_CHECK(handler, direction_ == Direction::Input);
+  // Don't require that the current direction be Input; this is also needed
+  // for relative tabbing on output to UTF-8.
   std::size_t length{1};
   if (auto recl{EffectiveRecordLength()}) {
     if (positionInRecord < *recl) {
@@ -148,6 +149,17 @@ std::size_t ExternalFileUnit::GetNextInputBytes(
   return p ? length : 0;
 }
 
+std::size_t ExternalFileUnit::GetPreviousInputBytes(
+    const char *&p, IoErrorHandler &handler) {
+  RUNTIME_CHECK(handler, direction_ == Direction::Input);
+  if (positionInRecord <= recordLength.value_or(positionInRecord)) {
+    p = Frame() + recordOffsetInFrame_ + positionInRecord;
+  } else {
+    p = nullptr;
+  }
+  return positionInRecord - leftTabLimit.value_or(0);
+}
+
 const char *ExternalFileUnit::FrameNextInput(
     IoErrorHandler &handler, std::size_t bytes) {
   RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted);
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index 83f839e205a48..ae050d3bd7653 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -166,6 +166,8 @@ class ExternalFileUnit : public ConnectionState,
   RT_API_ATTRS bool Receive(
       char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+  RT_API_ATTRS std::size_t GetPreviousInputBytes(
+      const char *&, IoErrorHandler &);
   RT_API_ATTRS bool BeginReadingRecord(IoErrorHandler &);
   RT_API_ATTRS void FinishReadingRecord(IoErrorHandler &);
   RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index f4b38d5225ce1..b09819cb2f736 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -44,6 +44,17 @@ RT_OFFLOAD_VAR_GROUP_END
 #endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 
 RT_OFFLOAD_API_GROUP_BEGIN
+
+std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
+  // Scan back over UTF-8 continuation bytes, if any
+  for (std::size_t n{1}; n <= limit; ++n) {
+    if ((end[-n] & 0xc0) != 0x80) {
+      return n;
+    }
+  }
+  return limit;
+}
+
 // Non-minimal encodings are accepted.
 Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h
index 29670d54b3eb6..10c2d61484217 100644
--- a/flang/runtime/utf.h
+++ b/flang/runtime/utf.h
@@ -58,6 +58,9 @@ static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) {
   return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
 }
 
+RT_API_ATTRS std::size_t MeasurePreviousUTF8Bytes(
+    const char *end, std::size_t limit);
+
 // Ensure that all bytes are present in sequence in the input buffer
 // before calling; use MeasureUTF8Bytes(first byte) to count them.
 RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);

``````````

</details>


https://github.com/llvm/llvm-project/pull/101388


More information about the flang-commits mailing list