[llvm] Create a CharSetConverter class with both iconv and icu support. (PR #138893)

Sergei Barannikov via llvm-commits llvm-commits at lists.llvm.org
Fri May 9 13:34:19 PDT 2025


================
@@ -0,0 +1,140 @@
+//===-- CharSet.h - Characters set conversion class ---------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+class EncodingConverterImplBase {
+
+private:
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// If the destination charset is a stateful character set, the shift state
+  /// will be set to the initial state.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+  virtual std::error_code convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) = 0;
+
+  /// Resets the converter to the initial state.
+  virtual void reset() = 0;
+
+public:
+  virtual ~EncodingConverterImplBase() = default;
+
+  /// Converts a string and resets the converter to the initial state.
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
+    auto EC = convertString(Source, Result);
+    reset();
+    return EC;
+  }
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+enum class TextEncoding {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+
+/// Utility class to convert between different character set encodings.
----------------
s-barannikov wrote:

```suggestion
/// Utility class to convert between different character encodings.
```

https://github.com/llvm/llvm-project/pull/138893


More information about the llvm-commits mailing list