[flang-commits] [flang] c2a95ad - [flang][runtime] Handle multi-byte characters while tabbing (#101388)
via flang-commits
flang-commits at lists.llvm.org
Fri Aug 2 12:05:01 PDT 2024
Author: Peter Klausler
Date: 2024-08-02T12:04:58-07:00
New Revision: c2a95ad25c65acede2492ac83039150f9522c3ae
URL: https://github.com/llvm/llvm-project/commit/c2a95ad25c65acede2492ac83039150f9522c3ae
DIFF: https://github.com/llvm/llvm-project/commit/c2a95ad25c65acede2492ac83039150f9522c3ae.diff
LOG: [flang][runtime] Handle multi-byte characters while tabbing (#101388)
When repositioning within the current record with control edit
descriptors (Xn, Tn, TLn, TRn), deal with multiple-byte character
encodings. This affects only external I/O to units with UTF-8 encoding.
Added:
Modified:
flang/runtime/format-implementation.h
flang/runtime/internal-unit.cpp
flang/runtime/internal-unit.h
flang/runtime/io-stmt.cpp
flang/runtime/io-stmt.h
flang/runtime/unit.cpp
flang/runtime/unit.h
flang/runtime/utf.cpp
flang/runtime/utf.h
Removed:
################################################################################
diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index 45d4bd641f6f6..74254bebe6e7a 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -113,6 +113,84 @@ RT_API_ATTRS int FormatControl<CONTEXT>::GetIntField(
return result;
}
+// Xn, TRn, TLn
+// Moves the current position within the record by 'n' characters: n > 0
+// moves right (Xn, TRn) and n < 0 moves left (TLn).  For external formatted
+// I/O on a UTF-8 connection the move is measured in (possibly multi-byte)
+// characters over the bytes that ViewBytesInRecord() exposes; otherwise it
+// is measured in bytes, multiplied by connection.internalIoCharKind when
+// that exceeds 1.  Always returns true.
+template <typename CONTEXT>
+static RT_API_ATTRS bool RelativeTabbing(CONTEXT &context, int n) {
+  ConnectionState &connection{context.GetConnectionState()};
+  if constexpr (std::is_same_v<CONTEXT,
+                    ExternalFormattedIoStatementState<Direction::Input>> ||
+      std::is_same_v<CONTEXT,
+          ExternalFormattedIoStatementState<Direction::Output>>) {
+    if (n != 0 && connection.isUTF8) {
+      const char *p{};
+      if (n > 0) { // Xn or TRn
+        // Skip 'n' multi-byte characters.  If that's more than are in the
+        // current record, that's valid -- the program can position past the
+        // end and then reposition back with Tn or TLn.
+        std::size_t bytesLeft{context.ViewBytesInRecord(p, true)};
+        for (; n > 0 && bytesLeft && p; --n) {
+          std::size_t byteCount{MeasureUTF8Bytes(*p)};
+          if (byteCount > bytesLeft) {
+            break; // the character would run past the viewable bytes
+          }
+          context.HandleRelativePosition(byteCount);
+          bytesLeft -= byteCount;
+          // Don't call GotChar(byteCount), these don't count towards SIZE=
+          p += byteCount;
+        }
+        // Any count still remaining in 'n' is applied as a byte offset by
+        // the common code at the end of this function.
+      } else { // n < 0: TLn
+        n = -n; // count the characters to back over as a positive number
+        if (std::int64_t excess{connection.positionInRecord -
+                connection.recordLength.value_or(connection.positionInRecord)};
+            excess > 0) {
+          // Have tabbed past the end of the record
+          if (excess >= n) {
+            context.HandleRelativePosition(-n);
+            return true;
+          }
+          context.HandleRelativePosition(-excess);
+          n -= excess;
+        }
+        std::size_t bytesLeft{context.ViewBytesInRecord(p, false)};
+        // Go back 'n' multi-byte characters.
+        for (; n > 0 && bytesLeft && p; --n) {
+          std::size_t byteCount{MeasurePreviousUTF8Bytes(p, bytesLeft)};
+          // Negate as a signed value: negating the unsigned count directly
+          // yields the intended offset only when std::size_t is as wide as
+          // std::int64_t.
+          context.HandleRelativePosition(
+              -static_cast<std::int64_t>(byteCount));
+          bytesLeft -= byteCount;
+          p -= byteCount;
+        }
+        // Whatever count could not be walked character by character is
+        // still a leftward move.  Restore its sign so that the common code
+        // below treats it like the byte-wise TLn path instead of tabbing to
+        // the right.
+        n = -n;
+      }
+    }
+  }
+  if (connection.internalIoCharKind > 1) {
+    n *= connection.internalIoCharKind;
+  }
+  context.HandleRelativePosition(n);
+  return true;
+}
+
+// Tn
+// Positions to the 1-based character column 'n' of the current record;
+// values of 'n' below 1 are treated as column 1.  Always returns true
+// on the byte-wise path; on the UTF-8 path the result is whatever
+// RelativeTabbing() returns.
+template <typename CONTEXT>
+static RT_API_ATTRS bool AbsoluteTabbing(CONTEXT &context, int n) {
+  ConnectionState &connection{context.GetConnectionState()};
+  // The 0-based offset that corresponds to the 1-based column number.
+  int offset{n > 0 ? n - 1 : 0};
+  constexpr bool isExternalFormatted{std::is_same_v<CONTEXT,
+                                         ExternalFormattedIoStatementState<
+                                             Direction::Input>> ||
+      std::is_same_v<CONTEXT,
+          ExternalFormattedIoStatementState<Direction::Output>>};
+  if constexpr (isExternalFormatted) {
+    if (connection.isUTF8) {
+      // Columns are characters, not bytes: return to the start of the
+      // record and then move right over (n-1) characters.
+      connection.HandleAbsolutePosition(0);
+      return RelativeTabbing(context, offset);
+    }
+  }
+  // Byte-wise positioning; multiply by the character kind when it exceeds 1.
+  const auto charKind{connection.internalIoCharKind};
+  if (charKind > 1) {
+    offset *= charKind;
+  }
+  context.HandleAbsolutePosition(offset);
+  return true;
+}
+
template <typename CONTEXT>
static RT_API_ATTRS void HandleControl(
CONTEXT &context, char ch, char next, int n) {
@@ -169,12 +247,7 @@ static RT_API_ATTRS void HandleControl(
}
break;
case 'X':
- if (!next) {
- ConnectionState &connection{context.GetConnectionState()};
- if (connection.internalIoCharKind > 1) {
- n *= connection.internalIoCharKind;
- }
- context.HandleRelativePosition(n);
+ if (!next && RelativeTabbing(context, n)) {
return;
}
break;
@@ -190,19 +263,13 @@ static RT_API_ATTRS void HandleControl(
break;
case 'T': {
if (!next) { // Tn
- --n; // convert 1-based to 0-based
- }
- ConnectionState &connection{context.GetConnectionState()};
- if (connection.internalIoCharKind > 1) {
- n *= connection.internalIoCharKind;
- }
- if (!next) { // Tn
- context.HandleAbsolutePosition(n);
- return;
- }
- if (next == 'L' || next == 'R') { // TLn & TRn
- context.HandleRelativePosition(next == 'L' ? -n : n);
- return;
+ if (AbsoluteTabbing(context, n)) {
+ return;
+ }
+ } else if (next == 'R' || next == 'L') { // TRn / TLn
+ if (RelativeTabbing(context, next == 'L' ? -n : n)) {
+ return;
+ }
}
} break;
default:
diff --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp
index 4097ea659edd4..f28700ee01581 100644
--- a/flang/runtime/internal-unit.cpp
+++ b/flang/runtime/internal-unit.cpp
@@ -80,6 +80,7 @@ RT_API_ATTRS bool InternalDescriptorUnit<DIR>::Emit(
template <Direction DIR>
RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
const char *&p, IoErrorHandler &handler) {
+ p = nullptr;
if constexpr (DIR == Direction::Output) {
handler.Crash("InternalDescriptorUnit<Direction::Output>::"
"GetNextInputBytes() called");
@@ -98,6 +99,28 @@ RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
}
}
+// Exposes the bytes of the current record on one side of the current
+// position without moving it.  When 'forward' is true, the result is the
+// number of bytes from the position up to the record length (0 when the
+// position is at or beyond it); otherwise it is the number of bytes between
+// the left tab limit (0 if unset) and the position.  'p' is set to the
+// address of the current position when CurrentRecord() is non-null and the
+// position is within the record, and to nullptr otherwise -- so the result
+// can be nonzero while 'p' is null.
+template <Direction DIR>
+RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::ViewBytesInRecord(
+    const char *&p, bool forward) const {
+  p = nullptr;
+  const auto effectiveLength{recordLength.value_or(positionInRecord)};
+  const char *const base{CurrentRecord()};
+  if (!forward) {
+    // Looking backward: being exactly at the end of the record is valid.
+    if (base && positionInRecord <= effectiveLength) {
+      p = base + positionInRecord;
+    }
+    return positionInRecord - leftTabLimit.value_or(0);
+  }
+  if (positionInRecord >= effectiveLength) {
+    return 0; // nothing lies ahead of the current position
+  }
+  if (base) {
+    p = base + positionInRecord;
+  }
+  return effectiveLength - positionInRecord;
+}
+
template <Direction DIR>
RT_API_ATTRS bool InternalDescriptorUnit<DIR>::AdvanceRecord(
IoErrorHandler &handler) {
diff --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h
index bcd38b62468af..a0ee6353eeda3 100644
--- a/flang/runtime/internal-unit.h
+++ b/flang/runtime/internal-unit.h
@@ -31,6 +31,7 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
RT_API_ATTRS bool Emit(const char *, std::size_t, IoErrorHandler &);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+ RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
RT_API_ATTRS void BackspaceRecord(IoErrorHandler &);
RT_API_ATTRS std::int64_t InquirePos();
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 1a5d32ecd8c5a..265bd0dc9d949 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -32,6 +32,12 @@ std::size_t IoStatementBase::GetNextInputBytes(const char *&p) {
return 0;
}
+// Base-class version: exposes no record bytes.  Sets 'p' to nullptr and
+// reports zero bytes regardless of the requested direction.
+std::size_t IoStatementBase::ViewBytesInRecord(
+    const char *&p, bool /*forward*/) const {
+  p = nullptr;
+  return std::size_t{0};
+}
+
bool IoStatementBase::AdvanceRecord(int) { return false; }
void IoStatementBase::BackspaceRecord() {}
@@ -105,6 +111,8 @@ std::size_t InternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
return unit_.GetNextInputBytes(p, *this);
}
+// InternalIoStatementState<DIR>::ViewBytesInRecord() not needed or defined
+
template <Direction DIR>
bool InternalIoStatementState<DIR>::AdvanceRecord(int n) {
while (n-- > 0) {
@@ -413,6 +421,12 @@ std::size_t ExternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
return unit().GetNextInputBytes(p, *this);
}
+// Forwards the request to this statement's external file unit, passing 'p'
+// and 'forward' through unchanged and returning the unit's result.
+template <Direction DIR>
+std::size_t ExternalIoStatementState<DIR>::ViewBytesInRecord(
+    const char *&p, bool forward) const {
+  const auto &fileUnit{unit()};
+  return fileUnit.ViewBytesInRecord(p, forward);
+}
+
template <Direction DIR>
bool ExternalIoStatementState<DIR>::AdvanceRecord(int n) {
while (n-- > 0) {
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index 6053aeb777b7a..d67d1ec80afce 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -93,6 +93,7 @@ class IoStatementState {
const char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
RT_API_ATTRS bool AdvanceRecord(int = 1);
RT_API_ATTRS void BackspaceRecord();
RT_API_ATTRS void HandleRelativePosition(std::int64_t byteOffset);
@@ -132,9 +133,9 @@ class IoStatementState {
RT_API_ATTRS Fortran::common::optional<char32_t> GetCurrentChar(
std::size_t &byteCount);
- // The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
- // are always in units of bytes, not characters; the distinction matters
- // for internal input from CHARACTER(KIND=2 and 4).
+ // The result of CueUpInput() and the "remaining" arguments to SkipSpaces()
+ // and NextInField() are always in units of bytes, not characters; the
+ // distinction matters for internal input from CHARACTER(KIND=2 and 4).
// For fixed-width fields, return the number of remaining bytes.
// Skip over leading blanks.
@@ -279,6 +280,7 @@ class IoStatementBase : public IoErrorHandler {
RT_API_ATTRS bool Receive(
char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
RT_API_ATTRS bool AdvanceRecord(int);
RT_API_ATTRS void BackspaceRecord();
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -448,6 +450,7 @@ class ExternalIoStatementBase : public IoStatementBase {
RT_API_ATTRS ExternalIoStatementBase(
ExternalFileUnit &, const char *sourceFile = nullptr, int sourceLine = 0);
RT_API_ATTRS ExternalFileUnit &unit() { return unit_; }
+ RT_API_ATTRS const ExternalFileUnit &unit() const { return unit_; }
RT_API_ATTRS MutableModes &mutableModes();
RT_API_ATTRS ConnectionState &GetConnectionState();
RT_API_ATTRS int asynchronousID() const { return asynchronousID_; }
@@ -473,6 +476,7 @@ class ExternalIoStatementState : public ExternalIoStatementBase,
RT_API_ATTRS bool Emit(
const char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
RT_API_ATTRS bool AdvanceRecord(int = 1);
RT_API_ATTRS void BackspaceRecord();
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -539,6 +543,7 @@ class ChildIoStatementState : public IoStatementBase,
RT_API_ATTRS bool Emit(
const char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 5c5bca835f3d8..4aee8397d477e 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -148,6 +148,24 @@ std::size_t ExternalFileUnit::GetNextInputBytes(
return p ? length : 0;
}
+// Exposes the bytes of the current record on one side of the current
+// position without moving it.  When 'forward' is true, 'p' addresses the
+// current position and the result is the number of bytes up to the record
+// length; if the position is at or beyond that length, 'p' is nullptr and
+// the result is 0.  When 'forward' is false, the result is the number of
+// bytes between the left tab limit (0 if unset) and the position, and 'p'
+// addresses the current position unless it lies beyond the record length.
+// The address is computed from Frame() and recordOffsetInFrame_.
+std::size_t ExternalFileUnit::ViewBytesInRecord(
+    const char *&p, bool forward) const {
+  p = nullptr;
+  const auto limit{recordLength.value_or(positionInRecord)};
+  if (!forward) {
+    // Looking backward: being exactly at the end of the record is valid.
+    if (positionInRecord <= limit) {
+      p = Frame() + recordOffsetInFrame_ + positionInRecord;
+    }
+    return positionInRecord - leftTabLimit.value_or(0);
+  }
+  if (positionInRecord >= limit) {
+    return 0; // nothing lies ahead of the current position
+  }
+  p = Frame() + recordOffsetInFrame_ + positionInRecord;
+  return limit - positionInRecord;
+}
+
const char *ExternalFileUnit::FrameNextInput(
IoErrorHandler &handler, std::size_t bytes) {
RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted);
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index 83f839e205a48..a3ea268681680 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -166,6 +166,7 @@ class ExternalFileUnit : public ConnectionState,
RT_API_ATTRS bool Receive(
char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+ RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
RT_API_ATTRS bool BeginReadingRecord(IoErrorHandler &);
RT_API_ATTRS void FinishReadingRecord(IoErrorHandler &);
RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index f4b38d5225ce1..b09819cb2f736 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -44,6 +44,17 @@ RT_OFFLOAD_VAR_GROUP_END
#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_API_GROUP_BEGIN
+
+// Returns the number of bytes occupied by the UTF-8 character that ends
+// immediately before 'end', reading no more than 'limit' bytes backwards.
+// If all 'limit' bytes are continuation bytes (or 'limit' is 0), the result
+// is 'limit'.
+std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
+  // Scan back over UTF-8 continuation bytes (binary 10xxxxxx), if any
+  const char *p{end};
+  for (std::size_t n{1}; n <= limit; ++n) {
+    // Step the pointer instead of indexing with end[-n]: negating the
+    // unsigned 'n' produces a huge offset that reaches the right byte only
+    // through pointer wrap-around.
+    --p;
+    if ((*p & 0xc0) != 0x80) {
+      return n;
+    }
+  }
+  return limit;
+}
+
// Non-minimal encodings are accepted.
Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h
index 29670d54b3eb6..10c2d61484217 100644
--- a/flang/runtime/utf.h
+++ b/flang/runtime/utf.h
@@ -58,6 +58,9 @@ static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) {
return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
}
+RT_API_ATTRS std::size_t MeasurePreviousUTF8Bytes(
+ const char *end, std::size_t limit);
+
// Ensure that all bytes are present in sequence in the input buffer
// before calling; use MeasureUTF8Bytes(first byte) to count them.
RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);
More information about the flang-commits
mailing list