[flang-commits] [flang] [flang][runtime] Handle multi-byte characters while tabbing (PR #101388)
via flang-commits
flang-commits at lists.llvm.org
Wed Jul 31 12:00:09 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-runtime
Author: Peter Klausler (klausler)
<details>
<summary>Changes</summary>
When repositioning within the current record with control edit descriptors (Xn, Tn, TLn, TRn), account for multi-byte character encodings so that tab counts are measured in characters rather than bytes. This affects only external I/O to units with UTF-8 encoding.
---
Full diff: https://github.com/llvm/llvm-project/pull/101388.diff
9 Files Affected:
- (modified) flang/runtime/format-implementation.h (+86-19)
- (modified) flang/runtime/internal-unit.cpp (+23)
- (modified) flang/runtime/internal-unit.h (+2)
- (modified) flang/runtime/io-stmt.cpp (+13)
- (modified) flang/runtime/io-stmt.h (+7-3)
- (modified) flang/runtime/unit.cpp (+13-1)
- (modified) flang/runtime/unit.h (+2)
- (modified) flang/runtime/utf.cpp (+11)
- (modified) flang/runtime/utf.h (+3)
``````````diff
diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index 45d4bd641f6f6..b0b6f9a75b969 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -113,6 +113,84 @@ RT_API_ATTRS int FormatControl<CONTEXT>::GetIntField(
return result;
}
+// Xn, TRn, TLn
+template <typename CONTEXT>
+static RT_API_ATTRS bool RelativeTabbing(CONTEXT &context, int n) {
+ ConnectionState &connection{context.GetConnectionState()};
+ if constexpr (std::is_same_v<CONTEXT,
+ ExternalFormattedIoStatementState<Direction::Input>> ||
+ std::is_same_v<CONTEXT,
+ ExternalFormattedIoStatementState<Direction::Output>>) {
+ if (n != 0 && connection.isUTF8) {
+ const char *p{};
+ if (n > 0) { // Xn or TRn
+ std::size_t bytesLeft{context.GetNextInputBytes(p)};
+ // Skip 'n' multi-byte characters. If that's more than are in the
+ // current record, that's valid -- the program can position past the
+ // end and then reposition back with Tn or TLn.
+ for (; n > 0 && bytesLeft; --n) {
+ std::size_t byteCount{MeasureUTF8Bytes(*p)};
+ if (byteCount > bytesLeft) {
+ break;
+ }
+ context.HandleRelativePosition(byteCount);
+ bytesLeft -= byteCount;
+ // Don't call GotChar(byteCount), these don't count towards SIZE=
+ p += byteCount;
+ }
+ } else { // n < 0: TLn
+ n = -n;
+ if (std::int64_t excess{connection.positionInRecord -
+ connection.recordLength.value_or(connection.positionInRecord)};
+ excess > 0) {
+ // Have tabbed past the end of the record
+ if (excess >= n) {
+ context.HandleRelativePosition(-n);
+ return true;
+ }
+ context.HandleRelativePosition(-excess);
+ n -= excess;
+ }
+ std::size_t bytesLeft{context.GetPreviousInputBytes(p)};
+ // Go back 'n' multi-byte characters.
+ for (; n > 0 && bytesLeft; --n) {
+ std::size_t byteCount{MeasurePreviousUTF8Bytes(p, bytesLeft)};
+ context.HandleRelativePosition(-byteCount);
+ bytesLeft -= byteCount;
+ p -= byteCount;
+ }
+ }
+ }
+ }
+ if (connection.internalIoCharKind > 1) {
+ n *= connection.internalIoCharKind;
+ }
+ context.HandleRelativePosition(n);
+ return true;
+}
+
+// Tn
+template <typename CONTEXT>
+static RT_API_ATTRS bool AbsoluteTabbing(CONTEXT &context, int n) {
+ ConnectionState &connection{context.GetConnectionState()};
+ n = n > 0 ? n - 1 : 0; // convert 1-based position to 0-based offset
+ if constexpr (std::is_same_v<CONTEXT,
+ ExternalFormattedIoStatementState<Direction::Input>> ||
+ std::is_same_v<CONTEXT,
+ ExternalFormattedIoStatementState<Direction::Output>>) {
+ if (connection.isUTF8) {
+ // Reset to the beginning of the record, then TR(n-1)
+ connection.HandleAbsolutePosition(0);
+ return RelativeTabbing(context, n);
+ }
+ }
+ if (connection.internalIoCharKind > 1) {
+ n *= connection.internalIoCharKind;
+ }
+ context.HandleAbsolutePosition(n);
+ return true;
+}
+
template <typename CONTEXT>
static RT_API_ATTRS void HandleControl(
CONTEXT &context, char ch, char next, int n) {
@@ -169,12 +247,7 @@ static RT_API_ATTRS void HandleControl(
}
break;
case 'X':
- if (!next) {
- ConnectionState &connection{context.GetConnectionState()};
- if (connection.internalIoCharKind > 1) {
- n *= connection.internalIoCharKind;
- }
- context.HandleRelativePosition(n);
+ if (!next && RelativeTabbing(context, n)) {
return;
}
break;
@@ -190,19 +263,13 @@ static RT_API_ATTRS void HandleControl(
break;
case 'T': {
if (!next) { // Tn
- --n; // convert 1-based to 0-based
- }
- ConnectionState &connection{context.GetConnectionState()};
- if (connection.internalIoCharKind > 1) {
- n *= connection.internalIoCharKind;
- }
- if (!next) { // Tn
- context.HandleAbsolutePosition(n);
- return;
- }
- if (next == 'L' || next == 'R') { // TLn & TRn
- context.HandleRelativePosition(next == 'L' ? -n : n);
- return;
+ if (AbsoluteTabbing(context, n)) {
+ return;
+ }
+ } else if (next == 'R' || next == 'L') { // TRn / TLn
+ if (RelativeTabbing(context, next == 'L' ? -n : n)) {
+ return;
+ }
}
} break;
default:
diff --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp
index 4097ea659edd4..1c569789d95c7 100644
--- a/flang/runtime/internal-unit.cpp
+++ b/flang/runtime/internal-unit.cpp
@@ -80,6 +80,7 @@ RT_API_ATTRS bool InternalDescriptorUnit<DIR>::Emit(
template <Direction DIR>
RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
const char *&p, IoErrorHandler &handler) {
+ p = nullptr;
if constexpr (DIR == Direction::Output) {
handler.Crash("InternalDescriptorUnit<Direction::Output>::"
"GetNextInputBytes() called");
@@ -98,6 +99,28 @@ RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
}
}
+template <Direction DIR>
+RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetPreviousInputBytes(
+ const char *&p, IoErrorHandler &handler) {
+ p = nullptr;
+ if constexpr (DIR == Direction::Output) {
+ handler.Crash("InternalDescriptorUnit<Direction::Output>::"
+ "GetPreviousInputBytes() called");
+ return 0;
+ } else {
+ const char *record{CurrentRecord()};
+ if (!record) {
+ handler.SignalEnd();
+ return 0;
+ } else {
+ if (positionInRecord < recordLength.value_or(positionInRecord)) {
+ p = &record[positionInRecord];
+ }
+ return positionInRecord - leftTabLimit.value_or(0);
+ }
+ }
+}
+
template <Direction DIR>
RT_API_ATTRS bool InternalDescriptorUnit<DIR>::AdvanceRecord(
IoErrorHandler &handler) {
diff --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h
index bcd38b62468af..f33d8df6b7e4e 100644
--- a/flang/runtime/internal-unit.h
+++ b/flang/runtime/internal-unit.h
@@ -31,6 +31,8 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
RT_API_ATTRS bool Emit(const char *, std::size_t, IoErrorHandler &);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+ RT_API_ATTRS std::size_t GetPreviousInputBytes(
+ const char *&, IoErrorHandler &);
RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
RT_API_ATTRS void BackspaceRecord(IoErrorHandler &);
RT_API_ATTRS std::int64_t InquirePos();
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 1a5d32ecd8c5a..9c63409166821 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -32,6 +32,11 @@ std::size_t IoStatementBase::GetNextInputBytes(const char *&p) {
return 0;
}
+std::size_t IoStatementBase::GetPreviousInputBytes(const char *&p) {
+ p = nullptr;
+ return 0;
+}
+
bool IoStatementBase::AdvanceRecord(int) { return false; }
void IoStatementBase::BackspaceRecord() {}
@@ -105,6 +110,8 @@ std::size_t InternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
return unit_.GetNextInputBytes(p, *this);
}
+// InternalIoStatementState<DIR>::GetPreviousInputBytes() not needed or defined
+
template <Direction DIR>
bool InternalIoStatementState<DIR>::AdvanceRecord(int n) {
while (n-- > 0) {
@@ -413,6 +420,12 @@ std::size_t ExternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
return unit().GetNextInputBytes(p, *this);
}
+template <Direction DIR>
+std::size_t ExternalIoStatementState<DIR>::GetPreviousInputBytes(
+ const char *&p) {
+ return unit().GetPreviousInputBytes(p, *this);
+}
+
template <Direction DIR>
bool ExternalIoStatementState<DIR>::AdvanceRecord(int n) {
while (n-- > 0) {
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index 6053aeb777b7a..bec466f39683a 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -93,6 +93,7 @@ class IoStatementState {
const char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
RT_API_ATTRS bool AdvanceRecord(int = 1);
RT_API_ATTRS void BackspaceRecord();
RT_API_ATTRS void HandleRelativePosition(std::int64_t byteOffset);
@@ -132,9 +133,9 @@ class IoStatementState {
RT_API_ATTRS Fortran::common::optional<char32_t> GetCurrentChar(
std::size_t &byteCount);
- // The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
- // are always in units of bytes, not characters; the distinction matters
- // for internal input from CHARACTER(KIND=2 and 4).
+ // The result of CueUpInput() and the "remaining" arguments to SkipSpaces()
+ // and NextInField() are always in units of bytes, not characters; the
+ // distinction matters for internal input from CHARACTER(KIND=2 and 4).
// For fixed-width fields, return the number of remaining bytes.
// Skip over leading blanks.
@@ -279,6 +280,7 @@ class IoStatementBase : public IoErrorHandler {
RT_API_ATTRS bool Receive(
char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
RT_API_ATTRS bool AdvanceRecord(int);
RT_API_ATTRS void BackspaceRecord();
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -473,6 +475,7 @@ class ExternalIoStatementState : public ExternalIoStatementBase,
RT_API_ATTRS bool Emit(
const char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
RT_API_ATTRS bool AdvanceRecord(int = 1);
RT_API_ATTRS void BackspaceRecord();
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -539,6 +542,7 @@ class ChildIoStatementState : public IoStatementBase,
RT_API_ATTRS bool Emit(
const char *, std::size_t bytes, std::size_t elementBytes = 0);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+ RT_API_ATTRS std::size_t GetPreviousInputBytes(const char *&);
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 5c5bca835f3d8..78fc2ea18b71a 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -134,7 +134,8 @@ bool ExternalFileUnit::Receive(char *data, std::size_t bytes,
std::size_t ExternalFileUnit::GetNextInputBytes(
const char *&p, IoErrorHandler &handler) {
- RUNTIME_CHECK(handler, direction_ == Direction::Input);
+ // Don't require that the current direction be Input; this is also needed
+ // for relative tabbing on output to UTF-8.
std::size_t length{1};
if (auto recl{EffectiveRecordLength()}) {
if (positionInRecord < *recl) {
@@ -148,6 +149,17 @@ std::size_t ExternalFileUnit::GetNextInputBytes(
return p ? length : 0;
}
+std::size_t ExternalFileUnit::GetPreviousInputBytes(
+ const char *&p, IoErrorHandler &handler) {
+ RUNTIME_CHECK(handler, direction_ == Direction::Input);
+ if (positionInRecord <= recordLength.value_or(positionInRecord)) {
+ p = Frame() + recordOffsetInFrame_ + positionInRecord;
+ } else {
+ p = nullptr;
+ }
+ return positionInRecord - leftTabLimit.value_or(0);
+}
+
const char *ExternalFileUnit::FrameNextInput(
IoErrorHandler &handler, std::size_t bytes) {
RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted);
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index 83f839e205a48..ae050d3bd7653 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -166,6 +166,8 @@ class ExternalFileUnit : public ConnectionState,
RT_API_ATTRS bool Receive(
char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+ RT_API_ATTRS std::size_t GetPreviousInputBytes(
+ const char *&, IoErrorHandler &);
RT_API_ATTRS bool BeginReadingRecord(IoErrorHandler &);
RT_API_ATTRS void FinishReadingRecord(IoErrorHandler &);
RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index f4b38d5225ce1..b09819cb2f736 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -44,6 +44,17 @@ RT_OFFLOAD_VAR_GROUP_END
#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_API_GROUP_BEGIN
+
+std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
+ // Scan back over UTF-8 continuation bytes, if any
+ for (std::size_t n{1}; n <= limit; ++n) {
+ if ((end[-n] & 0xc0) != 0x80) {
+ return n;
+ }
+ }
+ return limit;
+}
+
// Non-minimal encodings are accepted.
Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h
index 29670d54b3eb6..10c2d61484217 100644
--- a/flang/runtime/utf.h
+++ b/flang/runtime/utf.h
@@ -58,6 +58,9 @@ static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) {
return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
}
+RT_API_ATTRS std::size_t MeasurePreviousUTF8Bytes(
+ const char *end, std::size_t limit);
+
// Ensure that all bytes are present in sequence in the input buffer
// before calling; use MeasureUTF8Bytes(first byte) to count them.
RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);
``````````
</details>
https://github.com/llvm/llvm-project/pull/101388
More information about the flang-commits mailing list