[llvm] a9ee8e4 - Create a EncodingConverter class with both iconv and icu support. (#138893)

via llvm-commits llvm-commits at lists.llvm.org
Tue May 20 11:02:25 PDT 2025


Author: Abhina Sree
Date: 2025-05-20T14:02:22-04:00
New Revision: a9ee8e4a454ec01fefba8829d2847527aa80623f

URL: https://github.com/llvm/llvm-project/commit/a9ee8e4a454ec01fefba8829d2847527aa80623f
DIFF: https://github.com/llvm/llvm-project/commit/a9ee8e4a454ec01fefba8829d2847527aa80623f.diff

LOG: Create a EncodingConverter class with both iconv and icu support. (#138893)

This patch adds a wrapper class called EncodingConverter for
ConverterEBCDIC. This class is then extended to support the ICU library
or iconv library. The ICU library currently takes priority over the
iconv library.

Relevant RFCs:

https://discourse.llvm.org/t/rfc-adding-a-charset-converter-to-the-llvm-support-library/69795

https://discourse.llvm.org/t/rfc-enabling-fexec-charset-support-to-llvm-and-clang-reposting/71512

Stacked PR to enable fexec-charset that depends on this:
https://github.com/llvm/llvm-project/pull/138895

See old PR for review and commit history:
https://github.com/llvm/llvm-project/pull/74516

Added: 
    llvm/include/llvm/Support/TextEncoding.h
    llvm/lib/Support/TextEncoding.cpp
    llvm/unittests/Support/TextEncodingTest.cpp

Modified: 
    llvm/CMakeLists.txt
    llvm/cmake/config-ix.cmake
    llvm/include/llvm/Config/config.h.cmake
    llvm/lib/Support/CMakeLists.txt
    llvm/unittests/Support/CMakeLists.txt
    llvm/unittests/Support/ConvertEBCDICTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 91bedba8a548d..ed44b16bf9aeb 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -592,6 +592,10 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
+set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")

diff  --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 43311dad457ec..9d59fea8799b1 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
   set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
 endif()
 
+if (LLVM_ENABLE_ICU STREQUAL FORCE_ON AND LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+  message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
+endif()
+
+# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing.
+if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
+  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
+  if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+    find_package(ICU REQUIRED COMPONENTS uc i18n)
+    if (NOT ICU_FOUND)
+      message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
+    endif()
+  else()
+    find_package(ICU COMPONENTS uc i18n)
+  endif()
+  set(HAVE_ICU ${ICU_FOUND})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
+endif()
+
+# Check only for builtin iconv to avoid licensing issues.
+if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
+  if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+    find_package(Iconv REQUIRED)
+    if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
+      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+    endif()
+  else()
+    find_package(Iconv)
+  endif()
+  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+    set(HAVE_ICONV 1)
+  endif()
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)

diff  --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 7efac55ab0352..06d4756397911 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -236,6 +236,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define if ICU library is available */
+#cmakedefine01 HAVE_ICU
+
+/* Define if iconv library is available */
+#cmakedefine01 HAVE_ICONV
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 

diff  --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h
new file mode 100644
index 0000000000000..e204b95dd2dd7
--- /dev/null
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -0,0 +1,140 @@
+//===-- TextEncoding.h - Text encoding conversion class -----------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_TEXT_ENCODING_H
+#define LLVM_SUPPORT_TEXT_ENCODING_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+class TextEncodingConverterImplBase {
+
+private:
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// If the destination encoding is stateful, the shift state will be set
+  /// to the initial state.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+  virtual std::error_code convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) = 0;
+
+  /// Resets the converter to the initial state.
+  virtual void reset() = 0;
+
+public:
+  virtual ~TextEncodingConverterImplBase() = default;
+
+  /// Converts a string and resets the converter to the initial state.
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
+    auto EC = convertString(Source, Result);
+    reset();
+    return EC;
+  }
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+enum class TextEncoding {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+
+/// Utility class to convert between different character encodings.
+class TextEncodingConverter {
+  std::unique_ptr<details::TextEncodingConverterImplBase> Converter;
+
+  TextEncodingConverter(
+      std::unique_ptr<details::TextEncodingConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a TextEncodingConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] From the source character encoding
+  /// \param[in] To the target character encoding
+  /// \return a TextEncodingConverter instance or an error code
+  static ErrorOr<TextEncodingConverter> create(TextEncoding From,
+                                               TextEncoding To);
+
+  /// Creates a TextEncodingConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] From name of the source character encoding
+  /// \param[in] To name of the target character encoding
+  /// \return a TextEncodingConverter instance or an error code
+  static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To);
+
+  TextEncodingConverter(const TextEncodingConverter &) = delete;
+  TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;
+
+  TextEncodingConverter(TextEncodingConverter &&Other)
+      : Converter(std::move(Other.Converter)) {}
+
+  TextEncodingConverter &operator=(TextEncodingConverter &&Other) {
+    if (this != &Other)
+      Converter = std::move(Other.Converter);
+    return *this;
+  }
+
+  ~TextEncodingConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Converter->convert(Source, Result);
+  }
+
+  ErrorOr<std::string> convert(StringRef Source) const {
+    SmallString<100> Result;
+    auto EC = Converter->convert(Source, Result);
+    if (!EC)
+      return std::string(Result);
+    return EC;
+  }
+};
+
+} // namespace llvm
+
+#endif

diff  --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 241f3ebc95395..45d961e994a1a 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -257,6 +257,7 @@ add_llvm_component_library(LLVMSupport
   SuffixTree.cpp
   SystemUtils.cpp
   TarWriter.cpp
+  TextEncoding.cpp
   ThreadPool.cpp
   TimeProfiler.cpp
   Timer.cpp
@@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link ICU only when config-ix.cmake selected it (HAVE_ICU also drives config.h).
+if(HAVE_ICU)
+  target_link_libraries(LLVMSupport
+  PRIVATE
+  ${ICU_LIBRARIES}
+  )
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and

diff  --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp
new file mode 100644
index 0000000000000..969dd419ede72
--- /dev/null
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -0,0 +1,357 @@
+//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TextEncoding.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#if HAVE_ICU
+#include <unicode/ucnv.h>
+#elif HAVE_ICONV
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) {
+      Ch = toLower(Ch);
+      if (Ch != '0' || PrevDigit) {
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch);
+      }
+    }
+  }
+}
+
+// Maps the encoding name to enum constant if possible.
+static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(Name, Normalized);
+  if (Normalized.equals("utf8"))
+    return TextEncoding::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return TextEncoding::IBM1047;
+  return std::nullopt;
+}
+
+LLVM_ATTRIBUTE_UNUSED static void
+HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+               SmallVectorImpl<char> &Result) {
+  // Output buffer exhausted: double it (capped at max_size()) and re-point
+  // Output/OutputLength at it. Grow from >= 1, since doubling 0 stays 0.
+  Capacity = (Capacity < Result.max_size() / 2)
+                 ? 2 * std::max<size_t>(Capacity, 1)
+                 : Result.max_size();
+  Result.resize(0);
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
+namespace {
+enum ConversionType {
+  UTF8ToIBM1047,
+  IBM1047ToUTF8,
+};
+
+// Support conversion between EBCDIC 1047 and UTF-8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned encodings. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// encodings are not supported.
+class TextEncodingConverterTable final
+    : public details::TextEncodingConverterImplBase {
+  const ConversionType ConvType;
+
+public:
+  TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override {}
+};
+
+std::error_code
+TextEncodingConverterTable::convertString(StringRef Source,
+                                          SmallVectorImpl<char> &Result) {
+  switch (ConvType) {
+  case IBM1047ToUTF8:
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  case UTF8ToIBM1047:
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+#if HAVE_ICU
+struct UConverterDeleter {
+  void operator()(UConverter *Converter) const {
+    if (Converter)
+      ucnv_close(Converter);
+  }
+};
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
+class TextEncodingConverterICU final
+    : public details::TextEncodingConverterImplBase {
+  UConverterUniquePtr FromConvDesc;
+  UConverterUniquePtr ToConvDesc;
+
+public:
+  TextEncodingConverterICU(UConverterUniquePtr FromConverter,
+                           UConverterUniquePtr ToConverter)
+      : FromConvDesc(std::move(FromConverter)),
+        ToConvDesc(std::move(ToConverter)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and resume the conversion for the remaining string.
+// TODO: Improve translation of ICU errors to error_code
+std::error_code
+TextEncodingConverterICU::convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) {
+  // Setup the input in case it has no backing data.
+  size_t InputLength = Source.size();
+  const char *In = InputLength ? Source.data() : "";
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+  char *Output;
+  UErrorCode EC = U_ZERO_ERROR;
+
+  ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
+                      &EC);
+  ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
+                        NULL, &EC);
+  assert(U_SUCCESS(EC));
+
+  do {
+    EC = U_ZERO_ERROR;
+    const char *Input = In;
+    // Always target the buffer; a null Output would break the size math below.
+    Output = static_cast<char *>(Result.data());
+    ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
+                   In + InputLength, /*pivotStart=*/NULL,
+                   /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
+                   /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/true, &EC);
+    if (U_FAILURE(EC)) {
+      if (EC == U_BUFFER_OVERFLOW_ERROR) {
+        if (Capacity < Result.max_size()) {
+          HandleOverflow(Capacity, Output, OutputLength, Result);
+          continue;
+        } else
+          return std::error_code(E2BIG, std::generic_category());
+      }
+      // Some other error occurred.
+      Result.resize(Output - Result.data());
+      return std::error_code(EILSEQ, std::generic_category());
+    }
+    break;
+  } while (true);
+
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+void TextEncodingConverterICU::reset() {
+  ucnv_reset(&*FromConvDesc);
+  ucnv_reset(&*ToConvDesc);
+}
+
+#elif HAVE_ICONV
+class TextEncodingConverterIconv final
+    : public details::TextEncodingConverterImplBase {
+  class UniqueIconvT {
+    iconv_t ConvDesc;
+
+  public:
+    operator iconv_t() const { return ConvDesc; }
+    UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
+    ~UniqueIconvT() {
+      if (ConvDesc != (iconv_t)-1) {
+        iconv_close(ConvDesc);
+        ConvDesc = (iconv_t)-1;
+      }
+    }
+    UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
+      Other.ConvDesc = (iconv_t)-1;
+    }
+    UniqueIconvT &operator=(UniqueIconvT &&Other) {
+      // Exchange descriptors instead of overwriting ours, which would leak
+      // it: the one we held is closed when Other is destroyed. Swapping is
+      // also safe for self-assignment.
+      std::swap(ConvDesc, Other.ConvDesc);
+      return *this;
+    }
+  };
+  UniqueIconvT ConvDesc;
+
+public:
+  TextEncodingConverterIconv(UniqueIconvT ConvDesc)
+      : ConvDesc(std::move(ConvDesc)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and resume the conversion for the remaining string.
+std::error_code
+TextEncodingConverterIconv::convertString(StringRef Source,
+                                          SmallVectorImpl<char> &Result) {
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = static_cast<char *>(Result.data());
+  size_t OutputLength = Capacity;
+
+  size_t Ret;
+  // Translate a non-zero iconv() result. A default-constructed error_code
+  // means the buffer was enlarged and the conversion must be restarted.
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
+                      this](size_t Rc) {
+    // Capture errno first so that nothing below can clobber it.
+    const int Err = errno;
+    if (Rc == static_cast<size_t>(-1) && Err == E2BIG &&
+        Capacity < Result.max_size()) {
+      // Out of output space: grow the buffer and reset the converter.
+      HandleOverflow(Capacity, Output, OutputLength, Result);
+      reset();
+      return std::error_code();
+    }
+    // Unrecoverable: trim Result to the bytes converted so far.
+    Result.resize(Output - Result.data());
+    if (Rc == static_cast<size_t>(-1))
+      return std::error_code(Err, std::generic_category());
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way.
+    return std::make_error_code(std::errc::illegal_byte_sequence);
+  };
+
+  do {
+    // Setup the input. An empty Source may have no backing data, so point at
+    // an empty literal instead; the cast only satisfies iconv()'s signature.
+    size_t InputLength = Source.size();
+    char *Input = const_cast<char *>(InputLength ? Source.data() : "");
+    Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    // Flush the converter
+    Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+inline void TextEncodingConverterIconv::reset() {
+  iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+ErrorOr<TextEncodingConverter>
+TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
+
+  // Text encodings should be distinct.
+  if (CPFrom == CPTo)
+    return std::make_error_code(std::errc::invalid_argument);
+
+  ConversionType Conversion;
+  if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
+    Conversion = UTF8ToIBM1047;
+  else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
+    Conversion = IBM1047ToUTF8;
+  else
+    return std::make_error_code(std::errc::invalid_argument);
+
+  return TextEncodingConverter(
+      std::make_unique<TextEncodingConverterTable>(Conversion));
+}
+
+ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
+                                                             StringRef To) {
+  std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
+  std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
+  if (FromEncoding && ToEncoding) {
+    ErrorOr<TextEncodingConverter> Converter =
+        create(*FromEncoding, *ToEncoding);
+    if (Converter)
+      return Converter;
+  }
+#if HAVE_ICU
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+
+  UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+
+  auto Converter = std::make_unique<TextEncodingConverterICU>(
+      std::move(FromConvDesc), std::move(ToConvDesc));
+  return TextEncodingConverter(std::move(Converter));
+#elif HAVE_ICONV
+  iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::make_error_code(std::errc::invalid_argument);
+  return TextEncodingConverter(
+      std::make_unique<TextEncodingConverterIconv>(ConvDesc));
+#else
+  return std::make_error_code(std::errc::invalid_argument);
+#endif
+}

diff  --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index b6b9398df5e2e..d048e871fd0fb 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_unittest(SupportTests
   SuffixTreeTest.cpp
   SwapByteOrderTest.cpp
   TarWriterTest.cpp
+  TextEncodingTest.cpp
   ThreadPool.cpp
   ThreadSafeAllocatorTest.cpp
   Threading.cpp

diff  --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp
index eec76879ac92c..557f29c391f9c 100644
--- a/llvm/unittests/Support/ConvertEBCDICTest.cpp
+++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp
@@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
 // String with Cyrillic character ya.
 static const char CyrillicUTF[] = "\xd0\xaf";
 
-TEST(CharSet, FromUTF8) {
+TEST(ConverterEBCDIC, convertToEBCDIC) {
   // Hello string.
   StringRef Src(HelloA);
   SmallString<64> Dst;
@@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) {
   Dst.clear();
 }
 
-TEST(CharSet, ToUTF8) {
+TEST(ConverterEBCDIC, convertFromEBCDIC) {
   // Hello string.
   StringRef Src(HelloE);
   SmallString<64> Dst;

diff  --git a/llvm/unittests/Support/TextEncodingTest.cpp b/llvm/unittests/Support/TextEncodingTest.cpp
new file mode 100644
index 0000000000000..a453c0a34a5fe
--- /dev/null
+++ b/llvm/unittests/Support/TextEncodingTest.cpp
@@ -0,0 +1,299 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TextEncoding.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Config/config.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!"
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+static const char EarthUTFExtraPartial[] =
+    "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
+
+TEST(Encoding, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  ErrorOr<TextEncodingConverter> Conv =
+      TextEncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
+
+  // Must exist for UTF-8 <-> IBM-1047; stop rather than dereference an error.
+  ASSERT_TRUE(Conv);
+
+  std::error_code EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrA;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Src = CyrillicUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(Encoding, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  ErrorOr<TextEncodingConverter> Conv =
+      TextEncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
+
+  // Must exist for UTF-8 <-> IBM-1047; stop rather than dereference an error.
+  ASSERT_TRUE(Conv);
+
+  std::error_code EC = Conv->convert(Src, Dst);
+
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(Encoding, RoundTrip) {
+  ErrorOr<TextEncodingConverter> ConvToUTF16 =
+      TextEncodingConverter::create("IBM-1047", "UTF-16");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToUTF16); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  ErrorOr<TextEncodingConverter> ConvToUTF32 =
+      TextEncodingConverter::create("UTF-16", "UTF-32");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToUTF32); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  ErrorOr<TextEncodingConverter> ConvToEBCDIC =
+      TextEncodingConverter::create("UTF-32", "IBM-1047");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToEBCDIC); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Setup source string: bytes 1..255 followed by a terminating NUL.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+TEST(Encoding, ShiftState2022) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<8> Dst;
+
+  ErrorOr<TextEncodingConverter> ConvTo2022 =
+      TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvTo2022); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(Encoding, InvalidInput) {
+  // Earth string followed by a truncated multibyte sequence.
+  StringRef Src(EarthUTFExtraPartial);
+  SmallString<8> Dst;
+
+  ErrorOr<TextEncodingConverter> ConvTo2022 =
+      TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvTo2022); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Check that the string failed to convert.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(EC);
+}
+
+TEST(Encoding, InvalidOutput) {
+  // Cyrillic in UTF-16
+  ErrorOr<TextEncodingConverter> ConvToUTF16 =
+      TextEncodingConverter::create("UTF-8", "UTF-16");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToUTF16); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  ErrorOr<TextEncodingConverter> ConvToEBCDIC =
+      TextEncodingConverter::create("UTF-16", "IBM-1047");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToEBCDIC); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Cyrillic string. Convert to UTF-16 and check if properly converted
+  StringRef Src(CyrillicUTF);
+  SmallString<8> Dst, Dst1;
+  std::error_code EC = ConvToUTF16->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  EC = ConvToEBCDIC->convert(Dst, Dst1);
+  EXPECT_TRUE(EC);
+}
+
+TEST(Encoding, ShiftStateIBM939) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<TextEncodingConverter> ConvToIBM939 =
+      TextEncodingConverter::create("UTF-8", "IBM-939");
+
+#if HAVE_ICU
+  ASSERT_TRUE(ConvToIBM939); // Fatal: the converter is dereferenced below.
+#else
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToIBM939) {
+    ASSERT_EQ(ConvToIBM939.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+#endif
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvToIBM939->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+} // namespace


        


More information about the llvm-commits mailing list