[llvm] Create a EncodingConverter class with both iconv and icu support. (PR #138893)

Thu May 15 06:08:55 PDT 2025

https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/138893

>From 3192c7be06dd208a559442d067b2dba63bfe20dc Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 7 May 2025 11:16:28 -0400
Subject: [PATCH 1/5] Create a CharSetConverter class with both iconv and icu
 support.

---
 llvm/CMakeLists.txt                          |   4 +
 llvm/cmake/config-ix.cmake                   |  35 ++
 llvm/include/llvm/Config/config.h.cmake      |   6 +
 llvm/include/llvm/Support/CharSet.h          | 141 ++++++++
 llvm/lib/Support/CMakeLists.txt              |   9 +
 llvm/lib/Support/CharSet.cpp                 | 344 +++++++++++++++++++
 llvm/unittests/Support/CMakeLists.txt        |   1 +
 llvm/unittests/Support/CharSetTest.cpp       | 232 +++++++++++++
 llvm/unittests/Support/ConvertEBCDICTest.cpp |   4 +-
 9 files changed, 774 insertions(+), 2 deletions(-)
 create mode 100644 llvm/include/llvm/Support/CharSet.h
 create mode 100644 llvm/lib/Support/CharSet.cpp
 create mode 100644 llvm/unittests/Support/CharSetTest.cpp

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index e8d9ec0d6153a..894c0e1d2e5ae 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -592,6 +592,10 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
+set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 43311dad457ec..f7e826b34d26f 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
   set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
 endif()
 
+if (LLVM_ENABLE_ICU STREQUAL FORCE_ON AND LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+  message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
+endif()
+
+# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing.
+if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
+  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
+  if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+    find_package(ICU REQUIRED COMPONENTS uc i18n)
+    if (NOT ICU_FOUND)
+      message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
+    endif()
+  else()
+    find_package(ICU COMPONENTS uc i18n)
+  endif()
+  set(HAVE_ICU ${ICU_FOUND})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
+endif()
+
+# Check for builtin iconv to avoid licensing issues.
+if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
+  if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+    find_package(Iconv REQUIRED)
+    if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
+      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+    endif()
+  else()
+    find_package(Iconv)
+  endif()
+  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+    set(HAVE_ICONV 1)
+  endif()
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 7efac55ab0352..3f70a0150da4f 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -236,6 +236,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define if ICU library is available */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 0000000000000..6a28cd19f4143
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,141 @@
+//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+class CharSetConverterImplBase {
+
+private:
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// If the destination charset is a stateful character set, the shift state
+  /// will be set to the initial state.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+  virtual std::error_code convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) = 0;
+
+  /// Resets the converter to the initial state.
+  virtual void reset() = 0;
+
+public:
+  virtual ~CharSetConverterImplBase() = default;
+
+  /// Converts a string and resets the converter to the initial state.
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
+    auto EC = convertString(Source, Result);
+    reset();
+    return EC;
+  }
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+enum class id {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+class CharSetConverter {
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CSFrom the source character encoding
+  /// \param[in] CSTo the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
+                                          text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  CharSetConverter(CharSetConverter &&Other)
+      : Converter(std::move(Other.Converter)) {}
+
+  CharSetConverter &operator=(CharSetConverter &&Other) {
+    if (this != &Other)
+      Converter = std::move(Other.Converter);
+    return *this;
+  }
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Converter->convert(Source, Result);
+  }
+
+  ErrorOr<std::string> convert(StringRef Source) const {
+    SmallString<100> Result;
+    auto EC = Converter->convert(Source, Result);
+    if (!EC)
+      return std::string(Result);
+    return EC;
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index df1e65f3a588c..9a7d26a35bf1a 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -162,6 +162,7 @@ add_llvm_component_library(LLVMSupport
   CachePruning.cpp
   Caching.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
@@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link ICU library if it is an external library.
+if(ICU_FOUND)
+  target_link_libraries(LLVMSupport
+  PRIVATE
+  ${ICU_LIBRARIES}
+  )
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 0000000000000..6810cf9c6e376
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,344 @@
+//===-- CharSet.cpp - Character set conversion class --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) {
+      Ch = toLower(Ch);
+      if (Ch != '0' || PrevDigit) {
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch);
+      }
+    }
+  }
+}
+
+// Maps the charset name to enum constant if possible.
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
+  return std::nullopt;
+}
+
+LLVM_ATTRIBUTE_UNUSED static void
+HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+               SmallVectorImpl<char> &Result) {
+  // No space left in output buffer. Grow the underlying memory of the
+  // SmallVectorImpl (doubling it, but never staying at zero capacity, which
+  // would make the caller retry forever), then reset pointer and length.
+  Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                 ? std::max<size_t>(2 * Capacity, 16)
+                 : std::numeric_limits<size_t>::max();
+  Result.resize(0);
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
+namespace {
+enum ConversionType {
+  UTF8ToIBM1047,
+  IBM1047ToUTF8,
+};
+
+// Support conversion between EBCDIC 1047 and UTF-8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase {
+  const ConversionType ConvType;
+
+public:
+  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override {}
+};
+
+std::error_code
+CharSetConverterTable::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
+  if (ConvType == IBM1047ToUTF8) {
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  } else if (ConvType == UTF8ToIBM1047) {
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+#ifdef HAVE_ICU
+struct UConverterDeleter {
+  void operator()(UConverter *Converter) const {
+    if (Converter)
+      ucnv_close(Converter);
+  }
+};
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  UConverterUniquePtr FromConvDesc;
+  UConverterUniquePtr ToConvDesc;
+
+public:
+  CharSetConverterICU(UConverterUniquePtr FromConverter,
+                      UConverterUniquePtr ToConverter)
+      : FromConvDesc(std::move(FromConverter)),
+        ToConvDesc(std::move(ToConverter)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+std::error_code
+CharSetConverterICU::convertString(StringRef Source,
+                                   SmallVectorImpl<char> &Result) {
+  // Set up the input; an empty StringRef may have no backing data.
+  size_t InputLength = Source.size();
+  const char *In = InputLength ? Source.data() : "";
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+  char *Output = static_cast<char *>(Result.data());
+  UErrorCode EC = U_ZERO_ERROR;
+
+  ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
+                      &EC);
+  ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
+                        NULL, &EC);
+  assert(U_SUCCESS(EC));
+
+  do {
+    EC = U_ZERO_ERROR;
+    const char *Input = In;
+    // Always pass a real buffer: ucnv_convertEx rejects a null target pointer.
+    Output = static_cast<char *>(Result.data());
+    ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
+                   In + InputLength, /*pivotStart=*/NULL,
+                   /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
+                   /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/true, &EC);
+    if (U_FAILURE(EC)) {
+      if (EC == U_BUFFER_OVERFLOW_ERROR &&
+          Capacity < std::numeric_limits<size_t>::max()) {
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        continue;
+      }
+      // Some other error occurred. Keep only the successfully converted part.
+      Result.resize(Output - Result.data());
+      return std::error_code(EILSEQ, std::generic_category());
+    }
+    break;
+  } while (true);
+
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+void CharSetConverterICU::reset() {
+  ucnv_reset(&*FromConvDesc);
+  ucnv_reset(&*ToConvDesc);
+}
+
+#elif defined(HAVE_ICONV)
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  class UniqueIconvT {
+    iconv_t ConvDesc;
+
+  public:
+    operator iconv_t() const { return ConvDesc; }
+    UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
+    ~UniqueIconvT() {
+      if (ConvDesc != (iconv_t)-1) {
+        iconv_close(ConvDesc);
+        ConvDesc = (iconv_t)-1;
+      }
+    }
+    UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
+      Other.ConvDesc = (iconv_t)-1;
+    }
+    UniqueIconvT &operator=(UniqueIconvT &&Other) {
+      if (&Other != this) {
+        ConvDesc = Other.ConvDesc;
+        Other.ConvDesc = (iconv_t)-1;
+      }
+      return *this;
+    }
+  };
+  UniqueIconvT ConvDesc;
+
+public:
+  CharSetConverterIconv(UniqueIconvT ConvDesc)
+      : ConvDesc(std::move(ConvDesc)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+std::error_code
+CharSetConverterIconv::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  char *Output = static_cast<char *>(Result.data());
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+
+  size_t Ret;
+  // Handle errors returned from iconv().
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
+                      this](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // An error occurred. Check if we can gracefully handle it.
+      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        // Reset converter
+        iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+        return std::error_code();
+      } else {
+        // Some other error occurred.
+        Result.resize(Output - Result.data());
+        return std::error_code(errno, std::generic_category());
+      }
+    } else {
+      // A positive return value means some characters were converted in a
+      // nonreversible way (replaced with a SUB symbol). Treat this as an error
+      // so both conversion routines agree; keep only the converted output.
+      Result.resize(Output - Result.data());
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+  };
+
+  do {
+    // Setup the input. Use nullptr to reset iconv state if input length is
+    // zero.
+    size_t InputLength = Source.size();
+    char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+    Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    // Flush the converter
+    Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+void CharSetConverterIconv::reset() {
+  iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+ErrorOr<CharSetConverter> CharSetConverter::create(text_encoding::id CPFrom,
+                                                   text_encoding::id CPTo) {
+
+  assert(CPFrom != CPTo && "Text encodings should be distinct");
+
+  ConversionType Conversion;
+  if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
+    Conversion = UTF8ToIBM1047;
+  else if (CPFrom == text_encoding::id::IBM1047 &&
+           CPTo == text_encoding::id::UTF8)
+    Conversion = IBM1047ToUTF8;
+  else
+    return std::make_error_code(std::errc::invalid_argument);
+
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterTable>(Conversion);
+  return CharSetConverter(std::move(Converter));
+}
+
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {
+  std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
+  std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+  if (From && To) {
+    ErrorOr<CharSetConverter> Converter = create(*From, *To);
+    if (Converter)
+      return Converter;
+  }
+#ifdef HAVE_ICU
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
+  // ICU reports failures through EC, not errno.
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+  UConverterUniquePtr ToConvDesc(ucnv_open(CSTo.str().c_str(), &EC));
+  // Same for the target converter: report the unsupported encoding.
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
+                                            std::move(ToConvDesc));
+  return CharSetConverter(std::move(Converter));
+#elif defined(HAVE_ICONV)
+  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterIconv>(ConvDesc);
+  return CharSetConverter(std::move(Converter));
+#else
+  return std::make_error_code(std::errc::invalid_argument);
+#endif
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index b6b9398df5e2e..09e55f116f780 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_unittest(SupportTests
   CrashRecoveryTest.cpp
   Caching.cpp
   Casting.cpp
+  CharSetTest.cpp
   CheckedArithmeticTest.cpp
   Chrono.cpp
   CommandLineTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
new file mode 100644
index 0000000000000..772d46ec73497
--- /dev/null
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,232 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!"
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+static const char EarthUTFExtraPartial[] =
+    "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
+
+TEST(CharSet, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
+      text_encoding::id::UTF8, text_encoding::id::IBM1047);
+
+  // Stop test if conversion is not supported.
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  std::error_code EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrA;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Src = CyrillicUTF;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(CharSet, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
+      text_encoding::id::IBM1047, text_encoding::id::UTF8);
+
+  // Stop test if conversion is not supported.
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  std::error_code EC = Conv->convert(Src, Dst);
+
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentE;
+  EC = Conv->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, RoundTrip) {
+  ErrorOr<CharSetConverter> ConvToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Setup source string.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+TEST(CharSet, ShiftState2022) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<8> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, ShiftState2022Partial) {
+  // Earth string.
+  StringRef Src(EarthUTFExtraPartial);
+  SmallString<8> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(EC);
+}
+
+TEST(CharSet, ShiftStateIBM939) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvToIBM939 =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToIBM939) {
+    ASSERT_EQ(ConvToIBM939.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvToIBM939->convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+} // namespace
diff --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp
index eec76879ac92c..557f29c391f9c 100644
--- a/llvm/unittests/Support/ConvertEBCDICTest.cpp
+++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp
@@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
 // String with Cyrillic character ya.
 static const char CyrillicUTF[] = "\xd0\xaf";
 
-TEST(CharSet, FromUTF8) {
+TEST(ConverterEBCDIC, convertToEBCDIC) {
   // Hello string.
   StringRef Src(HelloA);
   SmallString<64> Dst;
@@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) {
   Dst.clear();
 }
 
-TEST(CharSet, ToUTF8) {
+TEST(ConverterEBCDIC, convertFromEBCDIC) {
   // Hello string.
   StringRef Src(HelloE);
   SmallString<64> Dst;

>From 6d40922368d2d0acd511f923791604a149f75667 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 9 May 2025 10:06:18 -0400
Subject: [PATCH 2/5] address comments

---
 llvm/include/llvm/Config/config.h.cmake |  4 +-
 llvm/include/llvm/Support/CharSet.h     |  8 ++--
 llvm/lib/Support/CharSet.cpp            | 59 ++++++++++++++-----------
 llvm/unittests/Support/CharSetTest.cpp  |  4 +-
 4 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 3f70a0150da4f..06d4756397911 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -237,10 +237,10 @@
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
 /* Define if ICU library is available */
-#cmakedefine HAVE_ICU ${HAVE_ICU}
+#cmakedefine01 HAVE_ICU
 
 /* Define if iconv library is available */
-#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+#cmakedefine01 HAVE_ICONV
 
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 6a28cd19f4143..8bb5baceccc20 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -69,15 +69,13 @@ class CharSetConverterImplBase {
 } // namespace details
 
 // Names inspired by https://wg21.link/p1885.
-namespace text_encoding {
-enum class id {
+enum class TextEncoding {
   /// UTF-8 character set encoding.
   UTF8,
 
   /// IBM EBCDIC 1047 character set encoding.
   IBM1047
 };
-} // end namespace text_encoding
 
 /// Utility class to convert between different character set encodings.
 class CharSetConverter {
@@ -93,8 +91,8 @@ class CharSetConverter {
   /// \param[in] CSFrom the source character encoding
   /// \param[in] CSTo the target character encoding
   /// \return a CharSetConverter instance or an error code
-  static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
-                                          text_encoding::id CSTo);
+  static ErrorOr<CharSetConverter> create(TextEncoding CSFrom,
+                                          TextEncoding CSTo);
 
   /// Creates a CharSetConverter instance.
   /// Returns std::errc::invalid_argument in case the requested conversion is
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 6810cf9c6e376..2c0b1ad67813c 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -22,9 +22,9 @@
 #include <limits>
 #include <system_error>
 
-#ifdef HAVE_ICU
+#if HAVE_ICU
 #include <unicode/ucnv.h>
-#elif defined(HAVE_ICONV)
+#elif HAVE_ICONV
 #include <iconv.h>
 #endif
 
@@ -47,13 +47,13 @@ static void normalizeCharSetName(StringRef CSName,
 }
 
 // Maps the charset name to enum constant if possible.
-static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+static std::optional<TextEncoding> getKnownCharSet(StringRef CSName) {
   SmallString<16> Normalized;
   normalizeCharSetName(CSName, Normalized);
   if (Normalized.equals("utf8"))
-    return text_encoding::id::UTF8;
+    return TextEncoding::UTF8;
   if (Normalized.equals("ibm1047"))
-    return text_encoding::id::IBM1047;
+    return TextEncoding::IBM1047;
   return std::nullopt;
 }
 
@@ -98,17 +98,18 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
 std::error_code
 CharSetConverterTable::convertString(StringRef Source,
                                      SmallVectorImpl<char> &Result) {
-  if (ConvType == IBM1047ToUTF8) {
+  switch (ConvType) {
+  case IBM1047ToUTF8:
     ConverterEBCDIC::convertToUTF8(Source, Result);
     return std::error_code();
-  } else if (ConvType == UTF8ToIBM1047) {
+  case UTF8ToIBM1047:
     return ConverterEBCDIC::convertToEBCDIC(Source, Result);
   }
   llvm_unreachable("Invalid ConvType!");
   return std::error_code();
 }
 
-#ifdef HAVE_ICU
+#if HAVE_ICU
 struct UConverterDeleter {
   void operator()(UConverter *Converter) const {
     if (Converter)
@@ -133,6 +134,10 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
   void reset() override;
 };
 
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and redo the conversion for the remaining string.
 std::error_code
 CharSetConverterICU::convertString(StringRef Source,
                                    SmallVectorImpl<char> &Result) {
@@ -144,7 +149,7 @@ CharSetConverterICU::convertString(StringRef Source,
   size_t Capacity = Result.capacity();
   size_t OutputLength = Capacity;
   Result.resize_for_overwrite(Capacity);
-  char *Output = static_cast<char *>(Result.data());
+  char *Output;
   UErrorCode EC = U_ZERO_ERROR;
 
   ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
@@ -185,7 +190,7 @@ void CharSetConverterICU::reset() {
   ucnv_reset(&*ToConvDesc);
 }
 
-#elif defined(HAVE_ICONV)
+#elif HAVE_ICONV
 class CharSetConverterIconv : public details::CharSetConverterImplBase {
   class UniqueIconvT {
     iconv_t ConvDesc;
@@ -222,6 +227,10 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
   void reset() override;
 };
 
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and redo the conversion for the remaining string.
 std::error_code
 CharSetConverterIconv::convertString(StringRef Source,
                                      SmallVectorImpl<char> &Result) {
@@ -289,35 +298,35 @@ void CharSetConverterIconv::reset() {
 #endif // HAVE_ICONV
 } // namespace
 
-ErrorOr<CharSetConverter> CharSetConverter::create(text_encoding::id CPFrom,
-                                                   text_encoding::id CPTo) {
+ErrorOr<CharSetConverter> CharSetConverter::create(TextEncoding CPFrom,
+                                                   TextEncoding CPTo) {
 
-  assert(CPFrom != CPTo && "Text encodings should be distinct");
+  // text encodings should be distinct
+  if(CPFrom == CPTo)
+    return std::make_error_code(std::errc::invalid_argument);
 
   ConversionType Conversion;
-  if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
+  if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
     Conversion = UTF8ToIBM1047;
-  else if (CPFrom == text_encoding::id::IBM1047 &&
-           CPTo == text_encoding::id::UTF8)
+  else if (CPFrom == TextEncoding::IBM1047 &&
+           CPTo == TextEncoding::UTF8)
     Conversion = IBM1047ToUTF8;
   else
     return std::error_code(errno, std::generic_category());
 
-  std::unique_ptr<details::CharSetConverterImplBase> Converter =
-      std::make_unique<CharSetConverterTable>(Conversion);
-  return CharSetConverter(std::move(Converter));
+  return CharSetConverter(std::make_unique<CharSetConverterTable>(Conversion));
 }
 
 ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
                                                    StringRef CSTo) {
-  std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
-  std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+  std::optional<TextEncoding> From = getKnownCharSet(CSFrom);
+  std::optional<TextEncoding> To = getKnownCharSet(CSTo);
   if (From && To) {
     ErrorOr<CharSetConverter> Converter = create(*From, *To);
     if (Converter)
       return Converter;
   }
-#ifdef HAVE_ICU
+#if HAVE_ICU
   UErrorCode EC = U_ZERO_ERROR;
   UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
   if (U_FAILURE(EC)) {
@@ -331,13 +340,11 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
       std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
                                             std::move(ToConvDesc));
   return CharSetConverter(std::move(Converter));
-#elif defined(HAVE_ICONV)
+#elif HAVE_ICONV
   iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
   if (ConvDesc == (iconv_t)-1)
     return std::error_code(errno, std::generic_category());
-  std::unique_ptr<details::CharSetConverterImplBase> Converter =
-      std::make_unique<CharSetConverterIconv>(ConvDesc);
-  return CharSetConverter(std::move(Converter));
+  return CharSetConverter(std::make_unique<CharSetConverterIconv>(ConvDesc));
 #else
   return std::make_error_code(std::errc::invalid_argument);
 #endif
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 772d46ec73497..eeaf24acda225 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -59,7 +59,7 @@ TEST(CharSet, FromUTF8) {
   SmallString<64> Dst;
 
   ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
-      text_encoding::id::UTF8, text_encoding::id::IBM1047);
+      TextEncoding::UTF8, TextEncoding::IBM1047);
 
   // Stop test if conversion is not supported.
   if (!Conv) {
@@ -99,7 +99,7 @@ TEST(CharSet, ToUTF8) {
   SmallString<64> Dst;
 
   ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
-      text_encoding::id::IBM1047, text_encoding::id::UTF8);
+      TextEncoding::IBM1047, TextEncoding::UTF8);
 
   // Stop test if conversion is not supported.
   if (!Conv) {

>From 52635f214877a2d46eee5f78d29ee1b5c97150b9 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 9 May 2025 10:38:39 -0400
Subject: [PATCH 3/5] rename CharSetConverter to EncodingConverter

---
 llvm/include/llvm/Support/CharSet.h    | 35 +++++++-------
 llvm/lib/Support/CharSet.cpp           | 64 +++++++++++++-------------
 llvm/unittests/Support/CharSetTest.cpp | 32 ++++++-------
 3 files changed, 66 insertions(+), 65 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 8bb5baceccc20..22263a60a1a1f 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -28,7 +28,7 @@ namespace llvm {
 template <typename T> class SmallVectorImpl;
 
 namespace details {
-class CharSetConverterImplBase {
+class EncodingConverterImplBase {
 
 private:
   /// Converts a string.
@@ -57,7 +57,7 @@ class CharSetConverterImplBase {
   virtual void reset() = 0;
 
 public:
-  virtual ~CharSetConverterImplBase() = default;
+  virtual ~EncodingConverterImplBase() = default;
 
   /// Converts a string and resets the converter to the initial state.
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
@@ -78,43 +78,44 @@ enum class TextEncoding {
 };
 
 /// Utility class to convert between different character set encodings.
-class CharSetConverter {
-  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+class EncodingConverter {
+  std::unique_ptr<details::EncodingConverterImplBase> Converter;
 
-  CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
+  EncodingConverter(
+      std::unique_ptr<details::EncodingConverterImplBase> Converter)
       : Converter(std::move(Converter)) {}
 
 public:
-  /// Creates a CharSetConverter instance.
+  /// Creates an EncodingConverter instance.
   /// Returns std::errc::invalid_argument in case the requested conversion is
   /// not supported.
   /// \param[in] CSFrom the source character encoding
   /// \param[in] CSTo the target character encoding
-  /// \return a CharSetConverter instance or an error code
-  static ErrorOr<CharSetConverter> create(TextEncoding CSFrom,
-                                          TextEncoding CSTo);
+  /// \return an EncodingConverter instance or an error code
+  static ErrorOr<EncodingConverter> create(TextEncoding CSFrom,
+                                           TextEncoding CSTo);
 
-  /// Creates a CharSetConverter instance.
+  /// Creates an EncodingConverter instance.
   /// Returns std::errc::invalid_argument in case the requested conversion is
   /// not supported.
   /// \param[in] CPFrom name of the source character encoding
   /// \param[in] CPTo name of the target character encoding
-  /// \return a CharSetConverter instance or an error code
-  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+  /// \return an EncodingConverter instance or an error code
+  static ErrorOr<EncodingConverter> create(StringRef CPFrom, StringRef CPTo);
 
-  CharSetConverter(const CharSetConverter &) = delete;
-  CharSetConverter &operator=(const CharSetConverter &) = delete;
+  EncodingConverter(const EncodingConverter &) = delete;
+  EncodingConverter &operator=(const EncodingConverter &) = delete;
 
-  CharSetConverter(CharSetConverter &&Other)
+  EncodingConverter(EncodingConverter &&Other)
       : Converter(std::move(Other.Converter)) {}
 
-  CharSetConverter &operator=(CharSetConverter &&Other) {
+  EncodingConverter &operator=(EncodingConverter &&Other) {
     if (this != &Other)
       Converter = std::move(Other.Converter);
     return *this;
   }
 
-  ~CharSetConverter() = default;
+  ~EncodingConverter() = default;
 
   /// Converts a string.
   /// \param[in] Source source string
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 2c0b1ad67813c..ef5123eef26a0 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -47,7 +47,7 @@ static void normalizeCharSetName(StringRef CSName,
 }
 
 // Maps the charset name to enum constant if possible.
-static std::optional<TextEncoding> getKnownCharSet(StringRef CSName) {
+static std::optional<TextEncoding> getKnownEncoding(StringRef CSName) {
   SmallString<16> Normalized;
   normalizeCharSetName(CSName, Normalized);
   if (Normalized.equals("utf8"))
@@ -83,11 +83,11 @@ enum ConversionType {
 // aforementioned character sets. The use of tables for conversion is only
 // possible because EBCDIC 1047 is a single-byte, stateless encoding; other
 // character sets are not supported.
-class CharSetConverterTable : public details::CharSetConverterImplBase {
+class EncodingConverterTable : public details::EncodingConverterImplBase {
   const ConversionType ConvType;
 
 public:
-  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+  EncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
 
   std::error_code convertString(StringRef Source,
                                 SmallVectorImpl<char> &Result) override;
@@ -96,8 +96,8 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
 };
 
 std::error_code
-CharSetConverterTable::convertString(StringRef Source,
-                                     SmallVectorImpl<char> &Result) {
+EncodingConverterTable::convertString(StringRef Source,
+                                      SmallVectorImpl<char> &Result) {
   switch (ConvType) {
   case IBM1047ToUTF8:
     ConverterEBCDIC::convertToUTF8(Source, Result);
@@ -118,13 +118,13 @@ struct UConverterDeleter {
 };
 using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
 
-class CharSetConverterICU : public details::CharSetConverterImplBase {
+class EncodingConverterICU : public details::EncodingConverterImplBase {
   UConverterUniquePtr FromConvDesc;
   UConverterUniquePtr ToConvDesc;
 
 public:
-  CharSetConverterICU(UConverterUniquePtr FromConverter,
-                      UConverterUniquePtr ToConverter)
+  EncodingConverterICU(UConverterUniquePtr FromConverter,
+                       UConverterUniquePtr ToConverter)
       : FromConvDesc(std::move(FromConverter)),
         ToConvDesc(std::move(ToConverter)) {}
 
@@ -139,8 +139,8 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
 // insufficient buffer size. In the future, it would be better to save the
 // partial result and redo the conversion for the remaining string.
 std::error_code
-CharSetConverterICU::convertString(StringRef Source,
-                                   SmallVectorImpl<char> &Result) {
+EncodingConverterICU::convertString(StringRef Source,
+                                    SmallVectorImpl<char> &Result) {
   // Setup the input in case it has no backing data.
   size_t InputLength = Source.size();
   const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
@@ -185,13 +185,13 @@ CharSetConverterICU::convertString(StringRef Source,
   return std::error_code();
 }
 
-void CharSetConverterICU::reset() {
+void EncodingConverterICU::reset() {
   ucnv_reset(&*FromConvDesc);
   ucnv_reset(&*ToConvDesc);
 }
 
 #elif HAVE_ICONV
-class CharSetConverterIconv : public details::CharSetConverterImplBase {
+class EncodingConverterIconv : public details::EncodingConverterImplBase {
   class UniqueIconvT {
     iconv_t ConvDesc;
 
@@ -218,7 +218,7 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
   UniqueIconvT ConvDesc;
 
 public:
-  CharSetConverterIconv(UniqueIconvT ConvDesc)
+  EncodingConverterIconv(UniqueIconvT ConvDesc)
       : ConvDesc(std::move(ConvDesc)) {}
 
   std::error_code convertString(StringRef Source,
@@ -232,8 +232,8 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
 // insufficient buffer size. In the future, it would be better to save the
 // partial result and redo the conversion for the remaining string.
 std::error_code
-CharSetConverterIconv::convertString(StringRef Source,
-                                     SmallVectorImpl<char> &Result) {
+EncodingConverterIconv::convertString(StringRef Source,
+                                      SmallVectorImpl<char> &Result) {
   // Setup the output. We directly write into the SmallVector.
   size_t Capacity = Result.capacity();
   char *Output = static_cast<char *>(Result.data());
@@ -291,38 +291,38 @@ CharSetConverterIconv::convertString(StringRef Source,
   return std::error_code();
 }
 
-void CharSetConverterIconv::reset() {
+void EncodingConverterIconv::reset() {
   iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
 }
 
 #endif // HAVE_ICONV
 } // namespace
 
-ErrorOr<CharSetConverter> CharSetConverter::create(TextEncoding CPFrom,
-                                                   TextEncoding CPTo) {
+ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
+                                                     TextEncoding CPTo) {
 
   // text encodings should be distinct
-  if(CPFrom == CPTo)
+  if (CPFrom == CPTo)
     return std::make_error_code(std::errc::invalid_argument);
 
   ConversionType Conversion;
   if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
     Conversion = UTF8ToIBM1047;
-  else if (CPFrom == TextEncoding::IBM1047 &&
-           CPTo == TextEncoding::UTF8)
+  else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
     Conversion = IBM1047ToUTF8;
   else
     return std::error_code(errno, std::generic_category());
 
-  return CharSetConverter(std::make_unique<CharSetConverterTable>(Conversion));
+  return EncodingConverter(
+      std::make_unique<EncodingConverterTable>(Conversion));
 }
 
-ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
-                                                   StringRef CSTo) {
-  std::optional<TextEncoding> From = getKnownCharSet(CSFrom);
-  std::optional<TextEncoding> To = getKnownCharSet(CSTo);
+ErrorOr<EncodingConverter> EncodingConverter::create(StringRef CSFrom,
+                                                     StringRef CSTo) {
+  std::optional<TextEncoding> From = getKnownEncoding(CSFrom);
+  std::optional<TextEncoding> To = getKnownEncoding(CSTo);
   if (From && To) {
-    ErrorOr<CharSetConverter> Converter = create(*From, *To);
+    ErrorOr<EncodingConverter> Converter = create(*From, *To);
     if (Converter)
       return Converter;
   }
@@ -336,15 +336,15 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
   if (U_FAILURE(EC)) {
     return std::error_code(errno, std::generic_category());
   }
-  std::unique_ptr<details::CharSetConverterImplBase> Converter =
-      std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
-                                            std::move(ToConvDesc));
-  return CharSetConverter(std::move(Converter));
+  std::unique_ptr<details::EncodingConverterImplBase> Converter =
+      std::make_unique<EncodingConverterICU>(std::move(FromConvDesc),
+                                             std::move(ToConvDesc));
+  return EncodingConverter(std::move(Converter));
 #elif HAVE_ICONV
   iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
   if (ConvDesc == (iconv_t)-1)
     return std::error_code(errno, std::generic_category());
-  return CharSetConverter(std::make_unique<CharSetConverterIconv>(ConvDesc));
+  return EncodingConverter(std::make_unique<EncodingConverterIconv>(ConvDesc));
 #else
   return std::make_error_code(std::errc::invalid_argument);
 #endif
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index eeaf24acda225..77e5b3064e1e9 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -58,8 +58,8 @@ TEST(CharSet, FromUTF8) {
   StringRef Src(HelloA);
   SmallString<64> Dst;
 
-  ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
-      TextEncoding::UTF8, TextEncoding::IBM1047);
+  ErrorOr<EncodingConverter> Conv =
+      EncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
 
   // Stop test if conversion is not supported.
   if (!Conv) {
@@ -98,8 +98,8 @@ TEST(CharSet, ToUTF8) {
   StringRef Src(HelloE);
   SmallString<64> Dst;
 
-  ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
-      TextEncoding::IBM1047, TextEncoding::UTF8);
+  ErrorOr<EncodingConverter> Conv =
+      EncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
 
   // Stop test if conversion is not supported.
   if (!Conv) {
@@ -129,24 +129,24 @@ TEST(CharSet, ToUTF8) {
 }
 
 TEST(CharSet, RoundTrip) {
-  ErrorOr<CharSetConverter> ConvToUTF16 =
-      CharSetConverter::create("IBM-1047", "UTF-16");
+  ErrorOr<EncodingConverter> ConvToUTF16 =
+      EncodingConverter::create("IBM-1047", "UTF-16");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToUTF16) {
     ASSERT_EQ(ConvToUTF16.getError(),
               std::make_error_code(std::errc::invalid_argument));
     return;
   }
-  ErrorOr<CharSetConverter> ConvToUTF32 =
-      CharSetConverter::create("UTF-16", "UTF-32");
+  ErrorOr<EncodingConverter> ConvToUTF32 =
+      EncodingConverter::create("UTF-16", "UTF-32");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToUTF32) {
     ASSERT_EQ(ConvToUTF32.getError(),
               std::make_error_code(std::errc::invalid_argument));
     return;
   }
-  ErrorOr<CharSetConverter> ConvToEBCDIC =
-      CharSetConverter::create("UTF-32", "IBM-1047");
+  ErrorOr<EncodingConverter> ConvToEBCDIC =
+      EncodingConverter::create("UTF-32", "IBM-1047");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToEBCDIC) {
     ASSERT_EQ(ConvToEBCDIC.getError(),
@@ -175,8 +175,8 @@ TEST(CharSet, ShiftState2022) {
   StringRef Src(EarthUTF);
   SmallString<8> Dst;
 
-  ErrorOr<CharSetConverter> ConvTo2022 =
-      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  ErrorOr<EncodingConverter> ConvTo2022 =
+      EncodingConverter::create("UTF-8", "ISO-2022-JP");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvTo2022) {
     ASSERT_EQ(ConvTo2022.getError(),
@@ -195,8 +195,8 @@ TEST(CharSet, ShiftState2022Partial) {
   StringRef Src(EarthUTFExtraPartial);
   SmallString<8> Dst;
 
-  ErrorOr<CharSetConverter> ConvTo2022 =
-      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  ErrorOr<EncodingConverter> ConvTo2022 =
+      EncodingConverter::create("UTF-8", "ISO-2022-JP");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvTo2022) {
     ASSERT_EQ(ConvTo2022.getError(),
@@ -214,8 +214,8 @@ TEST(CharSet, ShiftStateIBM939) {
   StringRef Src(EarthUTF);
   SmallString<64> Dst;
 
-  ErrorOr<CharSetConverter> ConvToIBM939 =
-      CharSetConverter::create("UTF-8", "IBM-939");
+  ErrorOr<EncodingConverter> ConvToIBM939 =
+      EncodingConverter::create("UTF-8", "IBM-939");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToIBM939) {
     ASSERT_EQ(ConvToIBM939.getError(),

>From a39b13ee1b47f5d1b539b546402f1f15e3827ab6 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 12 May 2025 09:00:21 -0400
Subject: [PATCH 4/5] address comments, rename CharSet to EncodingConverter

---
 .../{CharSet.h => EncodingConverter.h}        | 25 +++++-----
 llvm/lib/Support/CMakeLists.txt               |  2 +-
 .../{CharSet.cpp => EncodingConverter.cpp}    | 46 +++++++++----------
 llvm/unittests/Support/CMakeLists.txt         |  2 +-
 ...rSetTest.cpp => EncodingConverterTest.cpp} | 18 ++++----
 5 files changed, 45 insertions(+), 48 deletions(-)
 rename llvm/include/llvm/Support/{CharSet.h => EncodingConverter.h} (83%)
 rename llvm/lib/Support/{CharSet.cpp => EncodingConverter.cpp} (88%)
 rename llvm/unittests/Support/{CharSetTest.cpp => EncodingConverterTest.cpp} (95%)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/EncodingConverter.h
similarity index 83%
rename from llvm/include/llvm/Support/CharSet.h
rename to llvm/include/llvm/Support/EncodingConverter.h
index 22263a60a1a1f..6ceb7f7f547de 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/EncodingConverter.h
@@ -1,4 +1,4 @@
-//===-- CharSet.h - Characters set conversion class ---------------*- C++ -*-=//
+//===-- EncodingConverter.h - Encoding conversion class -----------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,8 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_CHARSET_H
-#define LLVM_SUPPORT_CHARSET_H
+#ifndef LLVM_SUPPORT_ENCODING_CONVERTER_H
+#define LLVM_SUPPORT_ENCODING_CONVERTER_H
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
@@ -44,8 +44,8 @@ class EncodingConverterImplBase {
   ///   - std::errc::invalid_argument: The input contains an incomplete
   ///     multibyte sequence.
   ///
-  /// If the destination charset is a stateful character set, the shift state
-  /// will be set to the initial state.
+  /// If the destination encoding is stateful, the shift state will be set
+  /// to the initial state.
   ///
   /// In case of an error, the result string contains the successfully converted
   /// part of the input string.
@@ -77,7 +77,7 @@ enum class TextEncoding {
   IBM1047
 };
 
-/// Utility class to convert between different character set encodings.
+/// Utility class to convert between different character encodings.
 class EncodingConverter {
   std::unique_ptr<details::EncodingConverterImplBase> Converter;
 
@@ -89,19 +89,18 @@ class EncodingConverter {
   /// Creates an EncodingConverter instance.
   /// Returns std::errc::invalid_argument in case the requested conversion is
   /// not supported.
-  /// \param[in] CSFrom the source character encoding
-  /// \param[in] CSTo the target character encoding
+  /// \param[in] From the source character encoding
+  /// \param[in] To the target character encoding
   /// \return an EncodingConverter instance or an error code
-  static ErrorOr<EncodingConverter> create(TextEncoding CSFrom,
-                                           TextEncoding CSTo);
+  static ErrorOr<EncodingConverter> create(TextEncoding From, TextEncoding To);
 
   /// Creates an EncodingConverter instance.
   /// Returns std::errc::invalid_argument in case the requested conversion is
   /// not supported.
-  /// \param[in] CPFrom name of the source character encoding
-  /// \param[in] CPTo name of the target character encoding
+  /// \param[in] From name of the source character encoding
+  /// \param[in] To name of the target character encoding
   /// \return an EncodingConverter instance or an error code
-  static ErrorOr<EncodingConverter> create(StringRef CPFrom, StringRef CPTo);
+  static ErrorOr<EncodingConverter> create(StringRef From, StringRef To);
 
   EncodingConverter(const EncodingConverter &) = delete;
   EncodingConverter &operator=(const EncodingConverter &) = delete;
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 9a7d26a35bf1a..64c25148faa01 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -162,7 +162,6 @@ add_llvm_component_library(LLVMSupport
   CachePruning.cpp
   Caching.cpp
   circular_raw_ostream.cpp
-  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
@@ -187,6 +186,7 @@ add_llvm_component_library(LLVMSupport
   ELFAttributes.cpp
   ELFAttrParserCompact.cpp
   ELFAttrParserExtended.cpp
+  EncodingConverter.cpp
   Error.cpp
   ErrorHandling.cpp
   ExponentialBackoff.cpp
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/EncodingConverter.cpp
similarity index 88%
rename from llvm/lib/Support/CharSet.cpp
rename to llvm/lib/Support/EncodingConverter.cpp
index ef5123eef26a0..838fc89b6df95 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/EncodingConverter.cpp
@@ -1,4 +1,4 @@
-//===-- CharSet.cpp - Characters sets conversion class ------------*- C++ -*-=//
+//===-- EncodingConverter.cpp - Encoding conversion class ---------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,11 +8,11 @@
 ///
 /// \file
 /// This file provides utility classes to convert between different character
-/// set encodings.
+/// encodings.
 ///
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/CharSet.h"
+#include "llvm/Support/EncodingConverter.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -46,10 +46,10 @@ static void normalizeCharSetName(StringRef CSName,
   }
 }
 
-// Maps the charset name to enum constant if possible.
-static std::optional<TextEncoding> getKnownEncoding(StringRef CSName) {
+// Maps the encoding name to enum constant if possible.
+static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
   SmallString<16> Normalized;
-  normalizeCharSetName(CSName, Normalized);
+  normalizeCharSetName(Name, Normalized);
   if (Normalized.equals("utf8"))
     return TextEncoding::UTF8;
   if (Normalized.equals("ibm1047"))
@@ -63,9 +63,8 @@ HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
   // No space left in output buffer. Double the size of the underlying
   // memory in the SmallVectorImpl, adjust pointer and length and continue
   // the conversion.
-  Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
-                 ? 2 * Capacity
-                 : std::numeric_limits<size_t>::max();
+  Capacity =
+      (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size();
   Result.resize(0);
   Result.resize_for_overwrite(Capacity);
   Output = static_cast<char *>(Result.data());
@@ -80,9 +79,9 @@ enum ConversionType {
 
 // Support conversion between EBCDIC 1047 and UTF-8. This class uses
 // built-in translation tables that allow for translation between the
-// aforementioned character sets. The use of tables for conversion is only
+// aforementioned encodings. The use of tables for conversion is only
 // possible because EBCDIC 1047 is a single-byte, stateless encoding; other
-// character sets are not supported.
+// encodings are not supported.
 class EncodingConverterTable : public details::EncodingConverterImplBase {
   const ConversionType ConvType;
 
@@ -169,8 +168,7 @@ EncodingConverterICU::convertString(StringRef Source,
                    /*pivotLimit=*/NULL, /*reset=*/true,
                    /*flush=*/true, &EC);
     if (U_FAILURE(EC)) {
-      if (EC == U_BUFFER_OVERFLOW_ERROR &&
-          Capacity < std::numeric_limits<size_t>::max()) {
+      if (EC == U_BUFFER_OVERFLOW_ERROR && Capacity < Result.max_size()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
         continue;
       }
@@ -246,7 +244,7 @@ EncodingConverterIconv::convertString(StringRef Source,
                       this](size_t Ret) {
     if (Ret == static_cast<size_t>(-1)) {
       // An error occured. Check if we can gracefully handle it.
-      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+      if (errno == E2BIG && Capacity < Result.max_size()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
         // Reset converter
         iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
@@ -301,7 +299,7 @@ void EncodingConverterIconv::reset() {
 ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
                                                      TextEncoding CPTo) {
 
-  // text encodings should be distinct
+  // Text encodings should be distinct.
   if (CPFrom == CPTo)
     return std::make_error_code(std::errc::invalid_argument);
 
@@ -317,22 +315,22 @@ ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
       std::make_unique<EncodingConverterTable>(Conversion));
 }
 
-ErrorOr<EncodingConverter> EncodingConverter::create(StringRef CSFrom,
-                                                     StringRef CSTo) {
-  std::optional<TextEncoding> From = getKnownEncoding(CSFrom);
-  std::optional<TextEncoding> To = getKnownEncoding(CSTo);
-  if (From && To) {
-    ErrorOr<EncodingConverter> Converter = create(*From, *To);
+ErrorOr<EncodingConverter> EncodingConverter::create(StringRef From,
+                                                     StringRef To) {
+  std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
+  std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
+  if (FromEncoding && ToEncoding) {
+    ErrorOr<EncodingConverter> Converter = create(*FromEncoding, *ToEncoding);
     if (Converter)
       return Converter;
   }
 #if HAVE_ICU
   UErrorCode EC = U_ZERO_ERROR;
-  UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
+  UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
   if (U_FAILURE(EC)) {
     return std::error_code(errno, std::generic_category());
   }
-  UConverterUniquePtr ToConvDesc(ucnv_open(CSTo.str().c_str(), &EC));
+  UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
   if (U_FAILURE(EC)) {
     return std::error_code(errno, std::generic_category());
   }
@@ -341,7 +339,7 @@ ErrorOr<EncodingConverter> EncodingConverter::create(StringRef CSFrom,
                                              std::move(ToConvDesc));
   return EncodingConverter(std::move(Converter));
 #elif HAVE_ICONV
-  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
   if (ConvDesc == (iconv_t)-1)
     return std::error_code(errno, std::generic_category());
   return EncodingConverter(std::make_unique<EncodingConverterIconv>(ConvDesc));
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 09e55f116f780..083c77a037d0f 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -20,7 +20,6 @@ add_llvm_unittest(SupportTests
   CrashRecoveryTest.cpp
   Caching.cpp
   Casting.cpp
-  CharSetTest.cpp
   CheckedArithmeticTest.cpp
   Chrono.cpp
   CommandLineTest.cpp
@@ -40,6 +39,7 @@ add_llvm_unittest(SupportTests
   ErrnoTest.cpp
   ErrorOrTest.cpp
   ErrorTest.cpp
+  EncodingConverterTest.cpp
   ExponentialBackoffTest.cpp
   ExtensibleRTTITest.cpp
   FileCollectorTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/EncodingConverterTest.cpp
similarity index 95%
rename from llvm/unittests/Support/CharSetTest.cpp
rename to llvm/unittests/Support/EncodingConverterTest.cpp
index 77e5b3064e1e9..9e6853a30d14d 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/EncodingConverterTest.cpp
@@ -1,4 +1,4 @@
-//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/CharSet.h"
+#include "llvm/Support/EncodingConverter.h"
 #include "llvm/ADT/SmallString.h"
 #include "gtest/gtest.h"
 using namespace llvm;
@@ -53,7 +53,7 @@ static const char EarthIBM939[] =
 static const char EarthUTFExtraPartial[] =
     "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
 
-TEST(CharSet, FromUTF8) {
+TEST(Encoding, FromUTF8) {
   // Hello string.
   StringRef Src(HelloA);
   SmallString<64> Dst;
@@ -93,7 +93,7 @@ TEST(CharSet, FromUTF8) {
   EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
 }
 
-TEST(CharSet, ToUTF8) {
+TEST(Encoding, ToUTF8) {
   // Hello string.
   StringRef Src(HelloE);
   SmallString<64> Dst;
@@ -128,7 +128,7 @@ TEST(CharSet, ToUTF8) {
   EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
 }
 
-TEST(CharSet, RoundTrip) {
+TEST(Encoding, RoundTrip) {
   ErrorOr<EncodingConverter> ConvToUTF16 =
       EncodingConverter::create("IBM-1047", "UTF-16");
   // Stop test if conversion is not supported (no underlying iconv support).
@@ -170,7 +170,7 @@ TEST(CharSet, RoundTrip) {
   EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
 }
 
-TEST(CharSet, ShiftState2022) {
+TEST(Encoding, ShiftState2022) {
   // Earth string.
   StringRef Src(EarthUTF);
   SmallString<8> Dst;
@@ -190,7 +190,7 @@ TEST(CharSet, ShiftState2022) {
   EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
 }
 
-TEST(CharSet, ShiftState2022Partial) {
+TEST(Encoding, InvalidInput) {
   // Earth string.
   StringRef Src(EarthUTFExtraPartial);
   SmallString<8> Dst;
@@ -204,12 +204,12 @@ TEST(CharSet, ShiftState2022Partial) {
     return;
   }
 
-  // Check that the string is properly converted.
+  // Check that the string failed to convert.
   std::error_code EC = ConvTo2022->convert(Src, Dst);
   EXPECT_TRUE(EC);
 }
 
-TEST(CharSet, ShiftStateIBM939) {
+TEST(Encoding, ShiftStateIBM939) {
   // Earth string.
   StringRef Src(EarthUTF);
   SmallString<64> Dst;

>From b32b472b6f6c01355bee447a98bcf453ac55e061 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 15 May 2025 09:08:40 -0400
Subject: [PATCH 5/5] rename filename, class to use TextEncoding, address
 comments

---
 llvm/CMakeLists.txt                           |  4 +-
 llvm/cmake/config-ix.cmake                    |  2 +-
 .../{EncodingConverter.h => TextEncoding.h}   | 37 +++++------
 llvm/lib/Support/CMakeLists.txt               |  2 +-
 ...EncodingConverter.cpp => TextEncoding.cpp} | 62 ++++++++++---------
 llvm/unittests/Support/CMakeLists.txt         |  2 +-
 ...ConverterTest.cpp => TextEncodingTest.cpp} | 34 +++++-----
 7 files changed, 74 insertions(+), 69 deletions(-)
 rename llvm/include/llvm/Support/{EncodingConverter.h => TextEncoding.h} (76%)
 rename llvm/lib/Support/{EncodingConverter.cpp => TextEncoding.cpp} (84%)
 rename llvm/unittests/Support/{EncodingConverterTest.cpp => TextEncodingTest.cpp} (88%)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 894c0e1d2e5ae..09f488fc45513 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -592,9 +592,9 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
-set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
 
-set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index f7e826b34d26f..9d59fea8799b1 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -314,7 +314,7 @@ if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
   set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
 endif()
 
-# Check for builtin iconv to avoid licensing issues.
+# Check only for builtin iconv to avoid licensing issues.
 if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
   if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
     find_package(Iconv REQUIRED)
diff --git a/llvm/include/llvm/Support/EncodingConverter.h b/llvm/include/llvm/Support/TextEncoding.h
similarity index 76%
rename from llvm/include/llvm/Support/EncodingConverter.h
rename to llvm/include/llvm/Support/TextEncoding.h
index 6ceb7f7f547de..fd457e5482bbe 100644
--- a/llvm/include/llvm/Support/EncodingConverter.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -1,4 +1,4 @@
-//===-- EncodingConverter.h - Encoding conversion class -----------*- C++ -*-=//
+//===-- TextEncoding.h - Encoding conversion class ----------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -28,7 +28,7 @@ namespace llvm {
 template <typename T> class SmallVectorImpl;
 
 namespace details {
-class EncodingConverterImplBase {
+class TextEncodingConverterImplBase {
 
 private:
   /// Converts a string.
@@ -57,7 +57,7 @@ class EncodingConverterImplBase {
   virtual void reset() = 0;
 
 public:
-  virtual ~EncodingConverterImplBase() = default;
+  virtual ~TextEncodingConverterImplBase() = default;
 
   /// Converts a string and resets the converter to the initial state.
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
@@ -78,43 +78,44 @@ enum class TextEncoding {
 };
 
 /// Utility class to convert between different character encodings.
-class EncodingConverter {
-  std::unique_ptr<details::EncodingConverterImplBase> Converter;
+class TextEncodingConverter {
+  std::unique_ptr<details::TextEncodingConverterImplBase> Converter;
 
-  EncodingConverter(
-      std::unique_ptr<details::EncodingConverterImplBase> Converter)
+  TextEncodingConverter(
+      std::unique_ptr<details::TextEncodingConverterImplBase> Converter)
       : Converter(std::move(Converter)) {}
 
 public:
-  /// Creates a EncodingConverter instance.
+  /// Creates a TextEncodingConverter instance.
   /// Returns std::errc::invalid_argument in case the requested conversion is
   /// not supported.
   /// \param[in] From the source character encoding
   /// \param[in] To the target character encoding
-  /// \return a EncodingConverter instance or an error code
-  static ErrorOr<EncodingConverter> create(TextEncoding From, TextEncoding To);
+  /// \return a TextEncodingConverter instance or an error code
+  static ErrorOr<TextEncodingConverter> create(TextEncoding From,
+                                               TextEncoding To);
 
-  /// Creates a EncodingConverter instance.
+  /// Creates a TextEncodingConverter instance.
   /// Returns std::errc::invalid_argument in case the requested conversion is
   /// not supported.
   /// \param[in] From name of the source character encoding
   /// \param[in] To name of the target character encoding
-  /// \return a EncodingConverter instance or an error code
-  static ErrorOr<EncodingConverter> create(StringRef From, StringRef To);
+  /// \return a TextEncodingConverter instance or an error code
+  static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To);
 
-  EncodingConverter(const EncodingConverter &) = delete;
-  EncodingConverter &operator=(const EncodingConverter &) = delete;
+  TextEncodingConverter(const TextEncodingConverter &) = delete;
+  TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;
 
-  EncodingConverter(EncodingConverter &&Other)
+  TextEncodingConverter(TextEncodingConverter &&Other)
       : Converter(std::move(Other.Converter)) {}
 
-  EncodingConverter &operator=(EncodingConverter &&Other) {
+  TextEncodingConverter &operator=(TextEncodingConverter &&Other) {
     if (this != &Other)
       Converter = std::move(Other.Converter);
     return *this;
   }
 
-  ~EncodingConverter() = default;
+  ~TextEncodingConverter() = default;
 
   /// Converts a string.
   /// \param[in] Source source string
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 64c25148faa01..09e93f5a2ca7d 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -186,7 +186,6 @@ add_llvm_component_library(LLVMSupport
   ELFAttributes.cpp
   ELFAttrParserCompact.cpp
   ELFAttrParserExtended.cpp
-  EncodingConverter.cpp
   Error.cpp
   ErrorHandling.cpp
   ExponentialBackoff.cpp
@@ -258,6 +257,7 @@ add_llvm_component_library(LLVMSupport
   SuffixTree.cpp
   SystemUtils.cpp
   TarWriter.cpp
+  TextEncoding.cpp
   ThreadPool.cpp
   TimeProfiler.cpp
   Timer.cpp
diff --git a/llvm/lib/Support/EncodingConverter.cpp b/llvm/lib/Support/TextEncoding.cpp
similarity index 84%
rename from llvm/lib/Support/EncodingConverter.cpp
rename to llvm/lib/Support/TextEncoding.cpp
index 838fc89b6df95..6f02b6e2e6d43 100644
--- a/llvm/lib/Support/EncodingConverter.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -1,4 +1,4 @@
-//===-- EncodingConverter.cpp - Encoding conversion class ---------*- C++ -*-=//
+//===-- TextEncoding.cpp - Encoding conversion class --------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/EncodingConverter.h"
+#include "llvm/Support/TextEncoding.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -82,11 +82,12 @@ enum ConversionType {
 // aforementioned encodings. The use of tables for conversion is only
 // possible because EBCDIC 1047 is a single-byte, stateless encoding; other
 // encodings are not supported.
-class EncodingConverterTable : public details::EncodingConverterImplBase {
+class TextEncodingConverterTable
+    : public details::TextEncodingConverterImplBase {
   const ConversionType ConvType;
 
 public:
-  EncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+  TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
 
   std::error_code convertString(StringRef Source,
                                 SmallVectorImpl<char> &Result) override;
@@ -95,8 +96,8 @@ class EncodingConverterTable : public details::EncodingConverterImplBase {
 };
 
 std::error_code
-EncodingConverterTable::convertString(StringRef Source,
-                                      SmallVectorImpl<char> &Result) {
+TextEncodingConverterTable::convertString(StringRef Source,
+                                          SmallVectorImpl<char> &Result) {
   switch (ConvType) {
   case IBM1047ToUTF8:
     ConverterEBCDIC::convertToUTF8(Source, Result);
@@ -117,13 +118,13 @@ struct UConverterDeleter {
 };
 using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
 
-class EncodingConverterICU : public details::EncodingConverterImplBase {
+class TextEncodingConverterICU : public details::TextEncodingConverterImplBase {
   UConverterUniquePtr FromConvDesc;
   UConverterUniquePtr ToConvDesc;
 
 public:
-  EncodingConverterICU(UConverterUniquePtr FromConverter,
-                       UConverterUniquePtr ToConverter)
+  TextEncodingConverterICU(UConverterUniquePtr FromConverter,
+                           UConverterUniquePtr ToConverter)
       : FromConvDesc(std::move(FromConverter)),
         ToConvDesc(std::move(ToConverter)) {}
 
@@ -138,8 +139,8 @@ class EncodingConverterICU : public details::EncodingConverterImplBase {
 // insufficient buffer size. In the future, it would better to save the partial
 // result and redo the conversion for the remaining string.
 std::error_code
-EncodingConverterICU::convertString(StringRef Source,
-                                    SmallVectorImpl<char> &Result) {
+TextEncodingConverterICU::convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) {
   // Setup the input in case it has no backing data.
   size_t InputLength = Source.size();
   const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
@@ -183,13 +184,14 @@ EncodingConverterICU::convertString(StringRef Source,
   return std::error_code();
 }
 
-void EncodingConverterICU::reset() {
+void TextEncodingConverterICU::reset() {
   ucnv_reset(&*FromConvDesc);
   ucnv_reset(&*ToConvDesc);
 }
 
 #elif HAVE_ICONV
-class EncodingConverterIconv : public details::EncodingConverterImplBase {
+class TextEncodingConverterIconv
+    : public details::TextEncodingConverterImplBase {
   class UniqueIconvT {
     iconv_t ConvDesc;
 
@@ -216,7 +218,7 @@ class EncodingConverterIconv : public details::EncodingConverterImplBase {
   UniqueIconvT ConvDesc;
 
 public:
-  EncodingConverterIconv(UniqueIconvT ConvDesc)
+  TextEncodingConverterIconv(UniqueIconvT ConvDesc)
       : ConvDesc(std::move(ConvDesc)) {}
 
   std::error_code convertString(StringRef Source,
@@ -230,8 +232,8 @@ class EncodingConverterIconv : public details::EncodingConverterImplBase {
 // insufficient buffer size. In the future, it would better to save the partial
 // result and redo the conversion for the remaining string.
 std::error_code
-EncodingConverterIconv::convertString(StringRef Source,
-                                      SmallVectorImpl<char> &Result) {
+TextEncodingConverterIconv::convertString(StringRef Source,
+                                          SmallVectorImpl<char> &Result) {
   // Setup the output. We directly write into the SmallVector.
   size_t Capacity = Result.capacity();
   char *Output = static_cast<char *>(Result.data());
@@ -289,15 +291,15 @@ EncodingConverterIconv::convertString(StringRef Source,
   return std::error_code();
 }
 
-void EncodingConverterIconv::reset() {
+void TextEncodingConverterIconv::reset() {
   iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
 }
 
 #endif // HAVE_ICONV
 } // namespace
 
-ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
-                                                     TextEncoding CPTo) {
+ErrorOr<TextEncodingConverter>
+TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
 
   // Text encodings should be distinct.
   if (CPFrom == CPTo)
@@ -311,16 +313,17 @@ ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
   else
     return std::error_code(errno, std::generic_category());
 
-  return EncodingConverter(
-      std::make_unique<EncodingConverterTable>(Conversion));
+  return TextEncodingConverter(
+      std::make_unique<TextEncodingConverterTable>(Conversion));
 }
 
-ErrorOr<EncodingConverter> EncodingConverter::create(StringRef From,
-                                                     StringRef To) {
+ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
+                                                             StringRef To) {
   std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
   std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
   if (FromEncoding && ToEncoding) {
-    ErrorOr<EncodingConverter> Converter = create(*FromEncoding, *ToEncoding);
+    ErrorOr<TextEncodingConverter> Converter =
+        create(*FromEncoding, *ToEncoding);
     if (Converter)
       return Converter;
   }
@@ -334,15 +337,16 @@ ErrorOr<EncodingConverter> EncodingConverter::create(StringRef From,
   if (U_FAILURE(EC)) {
     return std::error_code(errno, std::generic_category());
   }
-  std::unique_ptr<details::EncodingConverterImplBase> Converter =
-      std::make_unique<EncodingConverterICU>(std::move(FromConvDesc),
-                                             std::move(ToConvDesc));
-  return EncodingConverter(std::move(Converter));
+  std::unique_ptr<details::TextEncodingConverterImplBase> Converter =
+      std::make_unique<TextEncodingConverterICU>(std::move(FromConvDesc),
+                                                 std::move(ToConvDesc));
+  return TextEncodingConverter(std::move(Converter));
 #elif HAVE_ICONV
   iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
   if (ConvDesc == (iconv_t)-1)
     return std::error_code(errno, std::generic_category());
-  return EncodingConverter(std::make_unique<EncodingConverterIconv>(ConvDesc));
+  return TextEncodingConverter(
+      std::make_unique<TextEncodingConverterIconv>(ConvDesc));
 #else
   return std::make_error_code(std::errc::invalid_argument);
 #endif
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 083c77a037d0f..d048e871fd0fb 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -39,7 +39,6 @@ add_llvm_unittest(SupportTests
   ErrnoTest.cpp
   ErrorOrTest.cpp
   ErrorTest.cpp
-  EncodingConverterTest.cpp
   ExponentialBackoffTest.cpp
   ExtensibleRTTITest.cpp
   FileCollectorTest.cpp
@@ -89,6 +88,7 @@ add_llvm_unittest(SupportTests
   SuffixTreeTest.cpp
   SwapByteOrderTest.cpp
   TarWriterTest.cpp
+  TextEncodingTest.cpp
   ThreadPool.cpp
   ThreadSafeAllocatorTest.cpp
   Threading.cpp
diff --git a/llvm/unittests/Support/EncodingConverterTest.cpp b/llvm/unittests/Support/TextEncodingTest.cpp
similarity index 88%
rename from llvm/unittests/Support/EncodingConverterTest.cpp
rename to llvm/unittests/Support/TextEncodingTest.cpp
index 9e6853a30d14d..383dff12c64e4 100644
--- a/llvm/unittests/Support/EncodingConverterTest.cpp
+++ b/llvm/unittests/Support/TextEncodingTest.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/EncodingConverter.h"
+#include "llvm/Support/TextEncoding.h"
 #include "llvm/ADT/SmallString.h"
 #include "gtest/gtest.h"
 using namespace llvm;
@@ -58,8 +58,8 @@ TEST(Encoding, FromUTF8) {
   StringRef Src(HelloA);
   SmallString<64> Dst;
 
-  ErrorOr<EncodingConverter> Conv =
-      EncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
+  ErrorOr<TextEncodingConverter> Conv =
+      TextEncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
 
   // Stop test if conversion is not supported.
   if (!Conv) {
@@ -98,8 +98,8 @@ TEST(Encoding, ToUTF8) {
   StringRef Src(HelloE);
   SmallString<64> Dst;
 
-  ErrorOr<EncodingConverter> Conv =
-      EncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
+  ErrorOr<TextEncodingConverter> Conv =
+      TextEncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
 
   // Stop test if conversion is not supported.
   if (!Conv) {
@@ -129,24 +129,24 @@ TEST(Encoding, ToUTF8) {
 }
 
 TEST(Encoding, RoundTrip) {
-  ErrorOr<EncodingConverter> ConvToUTF16 =
-      EncodingConverter::create("IBM-1047", "UTF-16");
+  ErrorOr<TextEncodingConverter> ConvToUTF16 =
+      TextEncodingConverter::create("IBM-1047", "UTF-16");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToUTF16) {
     ASSERT_EQ(ConvToUTF16.getError(),
               std::make_error_code(std::errc::invalid_argument));
     return;
   }
-  ErrorOr<EncodingConverter> ConvToUTF32 =
-      EncodingConverter::create("UTF-16", "UTF-32");
+  ErrorOr<TextEncodingConverter> ConvToUTF32 =
+      TextEncodingConverter::create("UTF-16", "UTF-32");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToUTF32) {
     ASSERT_EQ(ConvToUTF32.getError(),
               std::make_error_code(std::errc::invalid_argument));
     return;
   }
-  ErrorOr<EncodingConverter> ConvToEBCDIC =
-      EncodingConverter::create("UTF-32", "IBM-1047");
+  ErrorOr<TextEncodingConverter> ConvToEBCDIC =
+      TextEncodingConverter::create("UTF-32", "IBM-1047");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToEBCDIC) {
     ASSERT_EQ(ConvToEBCDIC.getError(),
@@ -175,8 +175,8 @@ TEST(Encoding, ShiftState2022) {
   StringRef Src(EarthUTF);
   SmallString<8> Dst;
 
-  ErrorOr<EncodingConverter> ConvTo2022 =
-      EncodingConverter::create("UTF-8", "ISO-2022-JP");
+  ErrorOr<TextEncodingConverter> ConvTo2022 =
+      TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvTo2022) {
     ASSERT_EQ(ConvTo2022.getError(),
@@ -195,8 +195,8 @@ TEST(Encoding, InvalidInput) {
   StringRef Src(EarthUTFExtraPartial);
   SmallString<8> Dst;
 
-  ErrorOr<EncodingConverter> ConvTo2022 =
-      EncodingConverter::create("UTF-8", "ISO-2022-JP");
+  ErrorOr<TextEncodingConverter> ConvTo2022 =
+      TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvTo2022) {
     ASSERT_EQ(ConvTo2022.getError(),
@@ -214,8 +214,8 @@ TEST(Encoding, ShiftStateIBM939) {
   StringRef Src(EarthUTF);
   SmallString<64> Dst;
 
-  ErrorOr<EncodingConverter> ConvToIBM939 =
-      EncodingConverter::create("UTF-8", "IBM-939");
+  ErrorOr<TextEncodingConverter> ConvToIBM939 =
+      TextEncodingConverter::create("UTF-8", "IBM-939");
   // Stop test if conversion is not supported (no underlying iconv support).
   if (!ConvToIBM939) {
     ASSERT_EQ(ConvToIBM939.getError(),