[flang-commits] [flang] c2a95ad - [flang][runtime] Handle multi-byte characters while tabbing (#101388)

via flang-commits flang-commits at lists.llvm.org
Fri Aug 2 12:05:01 PDT 2024


Author: Peter Klausler
Date: 2024-08-02T12:04:58-07:00
New Revision: c2a95ad25c65acede2492ac83039150f9522c3ae

URL: https://github.com/llvm/llvm-project/commit/c2a95ad25c65acede2492ac83039150f9522c3ae
DIFF: https://github.com/llvm/llvm-project/commit/c2a95ad25c65acede2492ac83039150f9522c3ae.diff

LOG: [flang][runtime] Handle multi-byte characters while tabbing (#101388)

When repositioning within the current record with control edit
descriptors (Xn, Tn, TLn, TRn), deal with multiple-byte character
encodings. This affects only external I/O to units with UTF-8 encoding.

Added: 
    

Modified: 
    flang/runtime/format-implementation.h
    flang/runtime/internal-unit.cpp
    flang/runtime/internal-unit.h
    flang/runtime/io-stmt.cpp
    flang/runtime/io-stmt.h
    flang/runtime/unit.cpp
    flang/runtime/unit.h
    flang/runtime/utf.cpp
    flang/runtime/utf.h

Removed: 
    


################################################################################
diff  --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index 45d4bd641f6f6..74254bebe6e7a 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -113,6 +113,84 @@ RT_API_ATTRS int FormatControl<CONTEXT>::GetIntField(
   return result;
 }
 
+// Xn, TRn, TLn: move the position in the current record by 'n' characters
+// (n < 0 moves left).  Always returns true, so callers can test the result.
+template <typename CONTEXT>
+static RT_API_ATTRS bool RelativeTabbing(CONTEXT &context, int n) {
+  ConnectionState &connection{context.GetConnectionState()};
+  if constexpr (std::is_same_v<CONTEXT,
+                    ExternalFormattedIoStatementState<Direction::Input>> ||
+      std::is_same_v<CONTEXT,
+          ExternalFormattedIoStatementState<Direction::Output>>) {
+    if (n != 0 && connection.isUTF8) {
+      const char *p{};
+      if (n > 0) { // Xn or TRn
+        // Skip 'n' multi-byte characters.  If that's more than are in the
+        // current record, that's valid -- the program can position past the
+        // end and then reposition back with Tn or TLn.
+        std::size_t bytesLeft{context.ViewBytesInRecord(p, true)};
+        for (; n > 0 && bytesLeft && p; --n) {
+          std::size_t byteCount{MeasureUTF8Bytes(*p)};
+          if (byteCount > bytesLeft) {
+            break;
+          }
+          context.HandleRelativePosition(byteCount);
+          bytesLeft -= byteCount;
+          // Don't call GotChar(byteCount), these don't count towards SIZE=
+          p += byteCount;
+        }
+      } else { // n < 0: TLn.  'n' stays negative throughout this branch
+        if (std::int64_t excess{connection.positionInRecord -
+                connection.recordLength.value_or(connection.positionInRecord)};
+            excess > 0) {
+          // Have tabbed past the end of the record
+          if (excess >= -n) {
+            context.HandleRelativePosition(n);
+            return true;
+          }
+          context.HandleRelativePosition(-excess);
+          n += excess;
+        }
+        std::size_t bytesLeft{context.ViewBytesInRecord(p, false)};
+        // Go back -n multi-byte characters; any remainder stays leftward.
+        for (; n < 0 && bytesLeft && p; ++n) {
+          std::size_t byteCount{MeasurePreviousUTF8Bytes(p, bytesLeft)};
+          context.HandleRelativePosition(-static_cast<std::int64_t>(byteCount));
+          bytesLeft -= byteCount;
+          p -= byteCount;
+        }
+      }
+    }
+  }
+  if (connection.internalIoCharKind > 1) {
+    n *= connection.internalIoCharKind;
+  }
+  context.HandleRelativePosition(n); // whatever signed displacement remains
+  return true;
+}
+
+// Tn: tab to 1-based character position 'n' of the current record.
+template <typename CONTEXT>
+static RT_API_ATTRS bool AbsoluteTabbing(CONTEXT &context, int n) {
+  ConnectionState &connection{context.GetConnectionState()};
+  n = n > 0 ? n - 1 : 0; // 1-based position -> 0-based offset; n <= 0 gives 0
+  if constexpr (std::is_same_v<CONTEXT,
+                    ExternalFormattedIoStatementState<Direction::Input>> ||
+      std::is_same_v<CONTEXT,
+          ExternalFormattedIoStatementState<Direction::Output>>) {
+    if (connection.isUTF8) { // character and byte offsets can differ here
+      // Reset to the beginning of the record, then TR(n-1)
+      connection.HandleAbsolutePosition(0);
+      return RelativeTabbing(context, n);
+    }
+  }
+  if (connection.internalIoCharKind > 1) {
+    n *= connection.internalIoCharKind; // NOTE(review): presumably chars->bytes
+  }
+  context.HandleAbsolutePosition(n);
+  return true; // this path always reports success
+}
+
 template <typename CONTEXT>
 static RT_API_ATTRS void HandleControl(
     CONTEXT &context, char ch, char next, int n) {
@@ -169,12 +247,7 @@ static RT_API_ATTRS void HandleControl(
     }
     break;
   case 'X':
-    if (!next) {
-      ConnectionState &connection{context.GetConnectionState()};
-      if (connection.internalIoCharKind > 1) {
-        n *= connection.internalIoCharKind;
-      }
-      context.HandleRelativePosition(n);
+    if (!next && RelativeTabbing(context, n)) {
       return;
     }
     break;
@@ -190,19 +263,13 @@ static RT_API_ATTRS void HandleControl(
     break;
   case 'T': {
     if (!next) { // Tn
-      --n; // convert 1-based to 0-based
-    }
-    ConnectionState &connection{context.GetConnectionState()};
-    if (connection.internalIoCharKind > 1) {
-      n *= connection.internalIoCharKind;
-    }
-    if (!next) { // Tn
-      context.HandleAbsolutePosition(n);
-      return;
-    }
-    if (next == 'L' || next == 'R') { // TLn & TRn
-      context.HandleRelativePosition(next == 'L' ? -n : n);
-      return;
+      if (AbsoluteTabbing(context, n)) {
+        return;
+      }
+    } else if (next == 'R' || next == 'L') { // TRn / TLn
+      if (RelativeTabbing(context, next == 'L' ? -n : n)) {
+        return;
+      }
     }
   } break;
   default:

diff  --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp
index 4097ea659edd4..f28700ee01581 100644
--- a/flang/runtime/internal-unit.cpp
+++ b/flang/runtime/internal-unit.cpp
@@ -80,6 +80,7 @@ RT_API_ATTRS bool InternalDescriptorUnit<DIR>::Emit(
 template <Direction DIR>
 RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
     const char *&p, IoErrorHandler &handler) {
+  p = nullptr;
   if constexpr (DIR == Direction::Output) {
     handler.Crash("InternalDescriptorUnit<Direction::Output>::"
                   "GetNextInputBytes() called");
@@ -98,6 +99,28 @@ RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
   }
 }
 
+template <Direction DIR> // const look at the bytes next to the position
+RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::ViewBytesInRecord(
+    const char *&p, bool forward) const {
+  p = nullptr; // remains null when no byte pointer can be supplied
+  auto recl{recordLength.value_or(positionInRecord)};
+  const char *record{CurrentRecord()};
+  if (forward) { // view from the current position toward the record's end
+    if (positionInRecord < recl) {
+      if (record) {
+        p = &record[positionInRecord];
+      }
+      return recl - positionInRecord; // count is returned even if p is null
+    }
+  } else { // view from the current position back toward the left tab limit
+    if (record && positionInRecord <= recl) {
+      p = &record[positionInRecord]; // the counted bytes precede this address
+    }
+    return positionInRecord - leftTabLimit.value_or(0);
+  }
+  return 0; // forward view with the position at or past the record length
+}
+
 template <Direction DIR>
 RT_API_ATTRS bool InternalDescriptorUnit<DIR>::AdvanceRecord(
     IoErrorHandler &handler) {

diff  --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h
index bcd38b62468af..a0ee6353eeda3 100644
--- a/flang/runtime/internal-unit.h
+++ b/flang/runtime/internal-unit.h
@@ -31,6 +31,7 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
 
   RT_API_ATTRS bool Emit(const char *, std::size_t, IoErrorHandler &);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+  RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
   RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
   RT_API_ATTRS void BackspaceRecord(IoErrorHandler &);
   RT_API_ATTRS std::int64_t InquirePos();

diff  --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 1a5d32ecd8c5a..265bd0dc9d949 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -32,6 +32,12 @@ std::size_t IoStatementBase::GetNextInputBytes(const char *&p) {
   return 0;
 }
 
+std::size_t IoStatementBase::ViewBytesInRecord(
+    const char *&p, bool forward) const {
+  p = nullptr; // this base-class version exposes no record bytes
+  return 0; // ... and so reports a zero byte count in either direction
+}
+
 bool IoStatementBase::AdvanceRecord(int) { return false; }
 
 void IoStatementBase::BackspaceRecord() {}
@@ -105,6 +111,8 @@ std::size_t InternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
   return unit_.GetNextInputBytes(p, *this);
 }
 
+// InternalIoStatementState<DIR>::ViewBytesInRecord() not needed or defined
+
 template <Direction DIR>
 bool InternalIoStatementState<DIR>::AdvanceRecord(int n) {
   while (n-- > 0) {
@@ -413,6 +421,12 @@ std::size_t ExternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
   return unit().GetNextInputBytes(p, *this);
 }
 
+template <Direction DIR>
+std::size_t ExternalIoStatementState<DIR>::ViewBytesInRecord(
+    const char *&p, bool forward) const {
+  return unit().ViewBytesInRecord(p, forward); // delegate to the external unit
+}
+
 template <Direction DIR>
 bool ExternalIoStatementState<DIR>::AdvanceRecord(int n) {
   while (n-- > 0) {

diff  --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index 6053aeb777b7a..d67d1ec80afce 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -93,6 +93,7 @@ class IoStatementState {
       const char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
   RT_API_ATTRS bool AdvanceRecord(int = 1);
   RT_API_ATTRS void BackspaceRecord();
   RT_API_ATTRS void HandleRelativePosition(std::int64_t byteOffset);
@@ -132,9 +133,9 @@ class IoStatementState {
   RT_API_ATTRS Fortran::common::optional<char32_t> GetCurrentChar(
       std::size_t &byteCount);
 
-  // The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
-  // are always in units of bytes, not characters; the distinction matters
-  // for internal input from CHARACTER(KIND=2 and 4).
+  // The result of CueUpInput() and the "remaining" arguments to SkipSpaces()
+  // and NextInField() are always in units of bytes, not characters; the
+  // distinction matters for internal input from CHARACTER(KIND=2 and 4).
 
   // For fixed-width fields, return the number of remaining bytes.
   // Skip over leading blanks.
@@ -279,6 +280,7 @@ class IoStatementBase : public IoErrorHandler {
   RT_API_ATTRS bool Receive(
       char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
   RT_API_ATTRS bool AdvanceRecord(int);
   RT_API_ATTRS void BackspaceRecord();
   RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -448,6 +450,7 @@ class ExternalIoStatementBase : public IoStatementBase {
   RT_API_ATTRS ExternalIoStatementBase(
       ExternalFileUnit &, const char *sourceFile = nullptr, int sourceLine = 0);
   RT_API_ATTRS ExternalFileUnit &unit() { return unit_; }
+  RT_API_ATTRS const ExternalFileUnit &unit() const { return unit_; }
   RT_API_ATTRS MutableModes &mutableModes();
   RT_API_ATTRS ConnectionState &GetConnectionState();
   RT_API_ATTRS int asynchronousID() const { return asynchronousID_; }
@@ -473,6 +476,7 @@ class ExternalIoStatementState : public ExternalIoStatementBase,
   RT_API_ATTRS bool Emit(
       const char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
   RT_API_ATTRS bool AdvanceRecord(int = 1);
   RT_API_ATTRS void BackspaceRecord();
   RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -539,6 +543,7 @@ class ChildIoStatementState : public IoStatementBase,
   RT_API_ATTRS bool Emit(
       const char *, std::size_t bytes, std::size_t elementBytes = 0);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
   RT_API_ATTRS void HandleRelativePosition(std::int64_t);
   RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
 

diff  --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 5c5bca835f3d8..4aee8397d477e 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -148,6 +148,24 @@ std::size_t ExternalFileUnit::GetNextInputBytes(
   return p ? length : 0;
 }
 
+std::size_t ExternalFileUnit::ViewBytesInRecord(
+    const char *&p, bool forward) const {
+  p = nullptr; // remains null when no byte pointer can be supplied
+  auto recl{recordLength.value_or(positionInRecord)};
+  if (forward) { // view from the current position toward the record's end
+    if (positionInRecord < recl) {
+      p = Frame() + recordOffsetInFrame_ + positionInRecord;
+      return recl - positionInRecord;
+    }
+  } else { // view from the current position back toward the left tab limit
+    if (positionInRecord <= recl) {
+      p = Frame() + recordOffsetInFrame_ + positionInRecord;
+    }
+    return positionInRecord - leftTabLimit.value_or(0); // even if p is null
+  }
+  return 0; // forward view with the position at or past the record length
+}
+
 const char *ExternalFileUnit::FrameNextInput(
     IoErrorHandler &handler, std::size_t bytes) {
   RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted);

diff  --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index 83f839e205a48..a3ea268681680 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -166,6 +166,7 @@ class ExternalFileUnit : public ConnectionState,
   RT_API_ATTRS bool Receive(
       char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
   RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+  RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
   RT_API_ATTRS bool BeginReadingRecord(IoErrorHandler &);
   RT_API_ATTRS void FinishReadingRecord(IoErrorHandler &);
   RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);

diff  --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index f4b38d5225ce1..b09819cb2f736 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -44,6 +44,17 @@ RT_OFFLOAD_VAR_GROUP_END
 #endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 
 RT_OFFLOAD_API_GROUP_BEGIN
+
+// Length in bytes (<= limit) of the UTF-8 character ending just before 'end'
+std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
+  // Scan back over UTF-8 continuation bytes (10xxxxxx), if any
+  for (std::size_t n{1}; n <= limit; ++n) {
+    if ((*(end - n) & 0xc0) != 0x80) { // subtract; don't negate unsigned n
+      return n;
+    }
+  }
+  return limit;
+}
+
 // Non-minimal encodings are accepted.
 Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};

diff  --git a/flang/runtime/utf.h b/flang/runtime/utf.h
index 29670d54b3eb6..10c2d61484217 100644
--- a/flang/runtime/utf.h
+++ b/flang/runtime/utf.h
@@ -58,6 +58,9 @@ static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) {
   return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
 }
 
+RT_API_ATTRS std::size_t MeasurePreviousUTF8Bytes(
+    const char *end, std::size_t limit);
+
 // Ensure that all bytes are present in sequence in the input buffer
 // before calling; use MeasureUTF8Bytes(first byte) to count them.
 RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);


        


More information about the flang-commits mailing list