[llvm] Create a CharSetConverter class with both iconv and icu support (PR #74516)

Tue Mar 26 08:25:27 PDT 2024

https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/74516

>From 693f77c6aebe26f91bb75e556a612b3811c1cb15 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 5 Dec 2023 15:08:43 -0500
Subject: [PATCH 1/5] Create a CharSetConverter class with both iconv and icu
 support.

---
 llvm/cmake/config-ix.cmake              |  16 +
 llvm/include/llvm/Config/config.h.cmake |   6 +
 llvm/include/llvm/Support/CharSet.h     | 160 ++++++++++
 llvm/lib/Support/CMakeLists.txt         |  17 ++
 llvm/lib/Support/CharSet.cpp            | 370 ++++++++++++++++++++++++
 llvm/unittests/Support/CMakeLists.txt   |   1 +
 llvm/unittests/Support/CharSetTest.cpp  | 281 ++++++++++++++++++
 7 files changed, 851 insertions(+)
 create mode 100644 llvm/include/llvm/Support/CharSet.h
 create mode 100644 llvm/lib/Support/CharSet.cpp
 create mode 100644 llvm/unittests/Support/CharSetTest.cpp

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index bf1b110245bb2f..97a9a5816f596b 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -257,6 +257,22 @@ else()
   set(LLVM_ENABLE_TERMINFO 0)
 endif()
 
+# Check for ICU.
+find_package(ICU COMPONENTS uc i18n)
+if(ICU_FOUND)
+  set(HAVE_ICU 1)
+else()
+  set(HAVE_ICU 0)
+endif()
+
+# Check for iconv.
+find_package(Iconv)
+if(Iconv_FOUND)
+  set(HAVE_ICONV 1)
+else()
+  set(HAVE_ICONV 0)
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index fc1f9bf342f8d5..74003e1b22494e 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -281,6 +281,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define if icu library is available */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 00000000000000..856b3be65ff7ed
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,160 @@
+//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <functional>
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+class CharSetConverterImplBase {
+public:
+  virtual ~CharSetConverterImplBase() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+
+  virtual std::error_code convert(StringRef Source,
+                                  SmallVectorImpl<char> &Result,
+                                  bool ShouldAutoFlush) const = 0;
+
+  /// Restore the conversion to the original state.
+  /// \return error code in case something went wrong
+  ///
+  /// If the original character set or the destination character set
+  /// are multi-byte character sets, set the shift state to the initial
+  /// state. Otherwise this is a no-op.
+  virtual std::error_code flush() const = 0;
+
+  virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+enum class id {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+class CharSetConverter {
+  /// Owned backend that performs the actual conversion; null once moved from.
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  explicit CharSetConverter(
+      std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// \param[in] CSFrom identifier of the source character encoding
+  /// \param[in] CSTo identifier of the target character encoding
+  /// \return a CharSetConverter instance
+  static CharSetConverter create(text_encoding::id CSFrom,
+                                 text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  CharSetConverter(CharSetConverter &&Other) noexcept = default;
+  CharSetConverter &operator=(CharSetConverter &&Other) noexcept = default;
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings.
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return Converter->convert(Source, Result, ShouldAutoFlush);
+  }
+
+  /// Converts a single character without appending a shift-back sequence.
+  /// \return the first byte of the converted output, or \p SingleChar
+  /// unchanged if the conversion fails or produces no output.
+  char convert(char SingleChar) const {
+    SmallString<1> Result;
+    if (Converter->convert(StringRef(&SingleChar, 1), Result, false))
+      return SingleChar;
+    return Result.empty() ? SingleChar : Result[0];
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  std::error_code convert(const std::string &Source,
+                          SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return convert(StringRef(Source), Result, ShouldAutoFlush);
+  }
+
+  /// Forwards to the backend's flush(); see CharSetConverterImplBase::flush().
+  std::error_code flush() const { return Converter->flush(); }
+
+  /// Forwards to the backend's flush(Result), handing it \p Result.
+  std::error_code flush(SmallVectorImpl<char> &Result) const {
+    return Converter->flush(Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index e18beddf7bc5b7..3a8fe0e9e51b63 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -153,6 +153,7 @@ add_llvm_component_library(LLVMSupport
   CachePruning.cpp
   Caching.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
@@ -295,6 +296,22 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link icu library if it is an external library.
+if(ICU_FOUND)
+  target_link_libraries(LLVMSupport
+  PRIVATE
+  ${ICU_LIBRARIES}
+  )
+else()
+  # Link iconv library if it is an external library.
+  if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+    target_link_libraries(LLVMSupport
+    PRIVATE
+    ${Iconv_LIBRARIES}
+    )
+  endif()
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 00000000000000..dbc2cb7c1839d2
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,370 @@
+//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching:
+// keep only alphanumerics, lower-case them, and drop every zero that does not
+// follow a kept digit. The result is appended to Normalized.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (char Ch : CSName) {
+    if (!isAlnum(Ch) || (Ch == '0' && !PrevDigit))
+      continue;
+    PrevDigit = isDigit(Ch);
+    Normalized.push_back(toLower(Ch));
+  }
+}
+
+// Maps the charset name to enum constant if possible.
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+  // Compare against the normalized spellings of the built-in encodings.
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
+  // Not one of the built-in encodings.
+  return std::nullopt;
+}
+
+namespace {
+enum ConversionType {
+  UTFToIBM1047,
+  IBM1047ToUTF,
+};
+
+// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase {
+  ConversionType ConvType;
+
+public:
+  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterTable::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  switch (ConvType) {
+  case IBM1047ToUTF:
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  case UTFToIBM1047:
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+}
+
+std::error_code CharSetConverterTable::flush() const {
+  return std::error_code();
+}
+
+std::error_code
+CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
+}
+
+#ifdef HAVE_ICU
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  // Private clones of the constructor arguments; null if cloning failed.
+  UConverter *FromConvDesc = nullptr;
+  UConverter *ToConvDesc = nullptr;
+
+  static UConverter *cloneOrNull(UConverter *Converter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    UConverter *Clone = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
+    return U_FAILURE(EC) ? nullptr : Clone;
+  }
+
+public:
+  explicit CharSetConverterICU(UConverter *Converter)
+      : ToConvDesc(cloneOrNull(Converter)) {}
+  CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter)
+      : FromConvDesc(cloneOrNull(FromConverter)),
+        ToConvDesc(cloneOrNull(ToConverter)) {}
+  CharSetConverterICU(const CharSetConverterICU &) = delete;
+  CharSetConverterICU &operator=(const CharSetConverterICU &) = delete;
+  // Release the clones; ucnv_close() ignores null, so failed clones are fine.
+  ~CharSetConverterICU() override {
+    ucnv_close(FromConvDesc);
+    ucnv_close(ToConvDesc);
+  }
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+// Converts Source with ICU, replacing the contents of Result. The output
+// buffer is doubled and the conversion redone until the result fits.
+std::error_code CharSetConverterICU::convert(StringRef Source,
+                                             SmallVectorImpl<char> &Result,
+                                             bool ShouldAutoFlush) const {
+  // ShouldAutoFlush is not consulted: every call passes reset=true and
+  // flush=true to ICU, so no shift state is carried across calls.
+  constexpr size_t MaxSize = std::numeric_limits<size_t>::max();
+
+  // ucnv_convertEx() rejects null source/target pointers, so an empty input
+  // is handled here; it converts to an empty result.
+  if (Source.empty()) {
+    Result.clear();
+    return std::error_code();
+  }
+
+  // Start from the existing capacity, but never from zero, which doubling
+  // could not grow.
+  size_t Capacity = std::max<size_t>(Result.capacity(), 1);
+  while (true) {
+    // We directly write into the SmallVector, restarting from its beginning.
+    Result.resize_for_overwrite(Capacity);
+    const char *Input = Source.data();
+    char *const OutputBegin = Result.data();
+    char *Output = OutputBegin;
+    UErrorCode EC = U_ZERO_ERROR;
+    ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, OutputBegin + Capacity,
+                   &Input, Source.end(), /*pivotStart=*/nullptr,
+                   /*pivotSource=*/nullptr, /*pivotTarget=*/nullptr,
+                   /*pivotLimit=*/nullptr, /*reset=*/true, /*flush=*/true,
+                   &EC);
+
+    if (U_SUCCESS(EC)) {
+      Result.resize(Output - OutputBegin);
+      return std::error_code();
+    }
+    if (EC == U_BUFFER_OVERFLOW_ERROR && Capacity < MaxSize) {
+      // No space left in output buffer. Double the size of the underlying
+      // memory in the SmallVectorImpl and redo the conversion.
+      Capacity = (Capacity < MaxSize / 2) ? 2 * Capacity : MaxSize;
+      continue;
+    }
+
+    // ICU does not set errno, so translate the failure explicitly. Keep only
+    // the successfully converted part of the output.
+    Result.resize(Output - OutputBegin);
+    if (EC == U_BUFFER_OVERFLOW_ERROR)
+      return std::make_error_code(std::errc::argument_list_too_long);
+    if (EC == U_TRUNCATED_CHAR_FOUND)
+      return std::make_error_code(std::errc::invalid_argument);
+    return std::make_error_code(std::errc::illegal_byte_sequence);
+  }
+}
+
+std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
+
+std::error_code
+CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
+}
+
+#elif defined(HAVE_ICONV)
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  iconv_t ConvDesc; // Owned descriptor; closed by the destructor.
+public:
+  explicit CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+  CharSetConverterIconv(CharSetConverterIconv &&) = delete; // Also no copies.
+  ~CharSetConverterIconv() override { iconv_close(ConvDesc); }
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterIconv::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  constexpr size_t MaxSize = std::numeric_limits<size_t>::max();
+  // Setup the input. Use nullptr to reset iconv state if input length is zero.
+  size_t InputLength = Source.size();
+  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+  // Setup the output. We directly write into the SmallVector, from its start.
+  // Never begin with a zero capacity: doubling it below would not grow the
+  // buffer and the E2BIG retry loop would spin forever.
+  size_t Capacity = std::max<size_t>(Result.capacity(), 1);
+  Result.resize_for_overwrite(Capacity);
+  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  size_t OutputLength = Capacity;
+
+  // Handle a non-zero return value from iconv(). Returns success when the
+  // output buffer was grown and the call should be retried. Otherwise trims
+  // Result to the successfully converted part and returns the error.
+  auto HandleError = [&](size_t Ret) {
+    const bool Failed = Ret == static_cast<size_t>(-1);
+    if (Failed && errno == E2BIG && Capacity < MaxSize) {
+      // No space left in output buffer. Double the size of the underlying
+      // memory in the SmallVectorImpl, adjust pointer and length and continue
+      // the conversion.
+      const size_t Used = Capacity - OutputLength;
+      Capacity = (Capacity < MaxSize / 2) ? 2 * Capacity : MaxSize;
+      Result.resize_for_overwrite(Capacity);
+      Output = static_cast<char *>(Result.data()) + Used;
+      OutputLength = Capacity - Used;
+      return std::error_code();
+    }
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way. errno is captured before Result is touched.
+    const std::error_code EC =
+        Failed ? std::error_code(errno, std::generic_category())
+               : std::make_error_code(std::errc::illegal_byte_sequence);
+    Result.resize(Capacity - OutputLength);
+    return EC;
+  };
+
+  // Convert the string.
+  size_t Ret;
+  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+  if (ShouldAutoFlush) {
+    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+      if (auto EC = HandleError(Ret))
+        return EC;
+  }
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+std::error_code CharSetConverterIconv::flush() const {
+  size_t Ret = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+  if (Ret == static_cast<size_t>(-1)) {
+    return std::error_code(errno, std::generic_category());
+  }
+  return std::error_code();
+}
+
+std::error_code
+CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
+  char *Output = Result.data();
+  size_t OutputLength = Result.capacity();
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+
+  // Handle a non-zero iconv() result: grow the buffer and retry, or fail.
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // An error occurred. Check if we can gracefully handle it.
+      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+        // No space left in output buffer. Increase the size of the underlying
+        // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
+        // and continue the conversion.
+        const size_t Used = Capacity - OutputLength;
+        Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
+                       ? 2 + Capacity
+                       : std::numeric_limits<size_t>::max();
+        Result.resize_for_overwrite(Capacity);
+        Output = static_cast<char *>(Result.data()) + Used;
+        OutputLength = Capacity - Used;
+        return std::error_code();
+      } else {
+        // Some other error occurred; report it through errno.
+        return std::error_code(errno, std::generic_category());
+      }
+    } else {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+  };
+
+  size_t Ret;
+  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+
+  // Shrink Result to the number of bytes iconv() actually wrote.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
+                                          text_encoding::id CPTo) {
+  // Only the built-in table conversions are produced here, so the two
+  // encodings must differ.
+  assert(CPFrom != CPTo && "Text encodings should be distinct");
+
+  const bool IsUTF8ToEBCDIC = CPFrom == text_encoding::id::UTF8 &&
+                              CPTo == text_encoding::id::IBM1047;
+  const ConversionType Direction =
+      IsUTF8ToEBCDIC ? UTFToIBM1047 : IBM1047ToUTF;
+  // Every other combination is treated as IBM-1047 to UTF-8.
+  std::unique_ptr<details::CharSetConverterImplBase> TableConverter =
+      std::make_unique<CharSetConverterTable>(Direction);
+  return CharSetConverter(std::move(TableConverter));
+}
+
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {
+  std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
+  std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+  // Identical encodings have no table; leave them to the backends below.
+  if (From && To && *From != *To)
+    return create(*From, *To);
+#ifdef HAVE_ICU
+  // A failed first ucnv_open() leaves EC set, which makes the second a no-op.
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverter *FromConvDesc = ucnv_open(CSFrom.str().c_str(), &EC);
+  UConverter *ToConvDesc = ucnv_open(CSTo.str().c_str(), &EC);
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+  if (U_SUCCESS(EC))
+    Converter = std::make_unique<CharSetConverterICU>(FromConvDesc, ToConvDesc);
+  // CharSetConverterICU works on clones, so the originals are released here.
+  ucnv_close(FromConvDesc);
+  ucnv_close(ToConvDesc);
+  if (!Converter)
+    return std::make_error_code(std::errc::invalid_argument);
+  return CharSetConverter(std::move(Converter));
+#elif defined(HAVE_ICONV)
+  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  return CharSetConverter(std::make_unique<CharSetConverterIconv>(ConvDesc));
+#endif
+  return std::make_error_code(std::errc::invalid_argument);
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 15a126279125c5..5f2d87293905a2 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_unittest(SupportTests
   BalancedPartitioningTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
+  CharSetTest.cpp
   CrashRecoveryTest.cpp
   Casting.cpp
   CheckedArithmeticTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
new file mode 100644
index 00000000000000..2f2d8f97102b98
--- /dev/null
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,281 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!" followed by a newline.
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+// The same string encoded in ISO-2022-JP and in IBM-939, including the shift
+// sequences described above.
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::UTF8,
+                                                   text_encoding::id::IBM1047);
+  std::error_code EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrA;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentUTF;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Src = CyrillicUTF;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(CharSet, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::IBM1047,
+                                                   text_encoding::id::UTF8);
+  std::error_code EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrE;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentE;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, RoundTrip) {
+  ErrorOr<CharSetConverter> ConvToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Setup source string.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str, true);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str, true);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+TEST(CharSet, ShiftState2022) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvToIBM939 =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToIBM939) {
+    ASSERT_EQ(ConvToIBM939.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvToIBM939->convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+#if not defined(HAVE_ICU) && defined(HAVE_ICONV)
+
+// Identical to EarthUTF, except the final character (球) has its last byte
+// taken away from it.
+static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
+static const char EarthISO2022ShiftBack[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
+static const char ShiftBackOnly[] = "\x1B\x28\x42";
+
+// String "地球".
+static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthKanjiOnlyISO2022[] =
+    "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
+static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, ShiftState2022Flush) {
+  StringRef Src0(EarthUTFBroken);
+  StringRef Src1(EarthKanjiOnlyUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo2022Flush =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!ConvTo2022Flush) {
+    ASSERT_EQ(ConvTo2022Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // This should emit an error; there is a malformed multibyte character in the
+  // input string.
+  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, true);
+  EXPECT_TRUE(EC0);
+  std::error_code EC1 = ConvTo2022Flush->flush();
+  EXPECT_TRUE(!EC1);
+  std::error_code EC2 = ConvTo2022Flush->convert(Src1, Dst1, true);
+  EXPECT_TRUE(!EC2);
+  EXPECT_STREQ(EarthKanjiOnlyISO2022, static_cast<std::string>(Dst1).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939Flush) {
+  StringRef Src0(EarthUTFBroken);
+  StringRef Src1(EarthKanjiOnlyUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo939Flush =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  if (!ConvTo939Flush) {
+    ASSERT_EQ(ConvTo939Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // This should emit an error; there is a malformed multibyte character in the
+  // input string.
+  std::error_code EC0 = ConvTo939Flush->convert(Src0, Dst0, true);
+  EXPECT_TRUE(EC0);
+  std::error_code EC1 = ConvTo939Flush->flush();
+  EXPECT_TRUE(!EC1);
+  std::error_code EC2 = ConvTo939Flush->convert(Src1, Dst1, true);
+  EXPECT_TRUE(!EC2);
+  EXPECT_STREQ(EarthKanjiOnlyIBM939, static_cast<std::string>(Dst1).c_str());
+}
+
+TEST(CharSet, ShiftState2022Flush1) {
+  StringRef Src0(EarthUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo2022Flush =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!ConvTo2022Flush) {
+    ASSERT_EQ(ConvTo2022Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, false);
+  EXPECT_TRUE(!EC0);
+  EXPECT_STREQ(EarthISO2022ShiftBack, static_cast<std::string>(Dst0).c_str());
+  std::error_code EC1 = ConvTo2022Flush->flush(Dst1);
+  EXPECT_TRUE(!EC1);
+  EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Dst1).c_str());
+}
+
+#endif
+
+} // namespace

>From b7feb2529406c84db6e13b2e91c23e8b0025880e Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 9 Jan 2024 14:47:24 -0500
Subject: [PATCH 2/5] address review comments

---
 llvm/include/llvm/Support/CharSet.h |  38 ++++-----
 llvm/lib/Support/CharSet.cpp        | 116 ++++++++--------------------
 2 files changed, 49 insertions(+), 105 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 856b3be65ff7ed..fd077191c235b5 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -1,4 +1,4 @@
-//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -35,9 +35,9 @@ class CharSetConverterImplBase {
 
   /// Converts a string.
   /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
+  /// \param[out] Result container for converted string
   /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings iff true.
+  /// for stateful encodings if true.
   /// \return error code in case something went wrong
   ///
   /// The following error codes can occur, among others:
@@ -59,9 +59,9 @@ class CharSetConverterImplBase {
   /// Restore the conversion to the original state.
   /// \return error code in case something went wrong
   ///
-  /// If the original character set or the destination character set
-  /// are multi-byte character sets, set the shift state to the initial
-  /// state. Otherwise this is a no-op.
+  /// If the destination character set is a stateful character set,
+  /// set the shift state to the initial state.
+  /// Otherwise this is a no-op.
   virtual std::error_code flush() const = 0;
 
   virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
@@ -80,7 +80,6 @@ enum class id {
 } // end namespace text_encoding
 
 /// Utility class to convert between different character set encodings.
-/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
 class CharSetConverter {
   // details::CharSetConverterImplBase *Converter;
   std::unique_ptr<details::CharSetConverterImplBase> Converter;
@@ -121,33 +120,30 @@ class CharSetConverter {
 
   /// Converts a string.
   /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
+  /// \param[out] Result container for converted string
   /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings.
+  /// for stateful encodings.
   /// \return error code in case something went wrong
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush = true) const {
     return Converter->convert(Source, Result, ShouldAutoFlush);
   }
 
+  /// Converts \p Source and returns the text, or the converter's error code.
+  ErrorOr<std::string> convert(StringRef Source,
+                               bool ShouldAutoFlush = true) const {
+    SmallString<1> Buffer;
+    std::error_code Err = Converter->convert(Source, Buffer, ShouldAutoFlush);
+    return Err ? ErrorOr<std::string>(Err)
+               : ErrorOr<std::string>(std::string(Buffer));
+  }
+
   char convert(char SingleChar) const {
     SmallString<1> Result;
     Converter->convert(StringRef(&SingleChar, 1), Result, false);
     return Result[0];
   }
 
-  /// Converts a string.
-  /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
-  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings iff true.
-  /// \return error code in case something went wrong
-  std::error_code convert(const std::string &Source,
-                          SmallVectorImpl<char> &Result,
-                          bool ShouldAutoFlush = true) const {
-    return convert(StringRef(Source), Result, ShouldAutoFlush);
-  }
-
   std::error_code flush() const { return Converter->flush(); }
 
   std::error_code flush(SmallVectorImpl<char> &Result) const {
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index dbc2cb7c1839d2..1a49d665fdbda0 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -1,4 +1,4 @@
-//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//===-- CharSet.cpp - Character set conversion class --------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -32,7 +32,8 @@ using namespace llvm;
 
 // Normalize the charset name with the charset alias matching algorithm proposed
 // in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
-void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
   bool PrevDigit = false;
   for (auto Ch : CSName) {
     if (isAlnum(Ch)) {
@@ -49,15 +50,26 @@ void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
 std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   SmallString<16> Normalized;
   normalizeCharSetName(CSName, Normalized);
-#define CSNAME(CS, STR)                                                        \
-  if (Normalized.equals(STR))                                                  \
-  return CS
-  CSNAME(text_encoding::id::UTF8, "utf8");
-  CSNAME(text_encoding::id::IBM1047, "ibm1047");
-#undef CSNAME
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
   return std::nullopt;
 }
 
+void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+                    SmallVectorImpl<char> &Result) {
+  // No space left in the output buffer: double Capacity (from at least 1, so
+  // a zero capacity still grows; saturating at the size_t maximum), resize
+  // Result, and reset Output/OutputLength to span the whole buffer.
+  // NOTE(review): bytes already written are not skipped -- confirm callers.
+  constexpr size_t Max = std::numeric_limits<size_t>::max();
+  Capacity = Capacity == 0 ? 1 : (Capacity < Max / 2 ? 2 * Capacity : Max);
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
 namespace {
 enum ConversionType {
   UTFToIBM1047,
@@ -138,31 +150,12 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                                              SmallVectorImpl<char> &Result,
                                              bool ShouldAutoFlush) const {
   // Setup the output. We directly write into the SmallVector.
+  Result.resize_for_overwrite(Source.size());
   size_t OutputLength, Capacity = Result.capacity();
   char *Output, *Out;
 
   UErrorCode EC = U_ZERO_ERROR;
 
-  auto HandleError = [&Capacity, &Output, &OutputLength,
-                      &Result](UErrorCode UEC) {
-    if (UEC == U_BUFFER_OVERFLOW_ERROR &&
-        Capacity < std::numeric_limits<size_t>::max()) {
-      // No space left in output buffer. Double the size of the underlying
-      // memory in the SmallVectorImpl, adjust pointer and length and continue
-      // the conversion.
-      Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
-                     ? 2 * Capacity
-                     : std::numeric_limits<size_t>::max();
-      Result.resize_for_overwrite(Capacity);
-      Output = static_cast<char *>(Result.data());
-      OutputLength = Capacity;
-      return std::error_code();
-    } else {
-      // Some other error occured.
-      return std::error_code(errno, std::generic_category());
-    }
-  };
-
   do {
     EC = U_ZERO_ERROR;
     size_t InputLength = Source.size();
@@ -176,10 +169,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
     ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
-                   /*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC);
+                   /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/ShouldAutoFlush, &EC);
     if (U_FAILURE(EC)) {
-      if (auto error = HandleError(EC))
-        return error;
+      if (EC == U_BUFFER_OVERFLOW_ERROR &&
+          Capacity < std::numeric_limits<size_t>::max())
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+      else
+        // Some other error occurred.
+        return std::error_code(errno, std::generic_category());
     } else if (U_SUCCESS(EC))
       break;
   } while (U_FAILURE(EC));
@@ -215,8 +213,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   size_t InputLength = Source.size();
   char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
   // Setup the output. We directly write into the SmallVector.
+  Result.resize_for_overwrite(Source.size());
   size_t Capacity = Result.capacity();
-  Result.resize_for_overwrite(Capacity);
   char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
   size_t OutputLength = Capacity;
 
@@ -227,16 +225,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
     if (Ret == static_cast<size_t>(-1)) {
       // An error occured. Check if we can gracefully handle it.
       if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
-        // No space left in output buffer. Double the size of the underlying
-        // memory in the SmallVectorImpl, adjust pointer and length and continue
-        // the conversion.
-        const size_t Used = Capacity - OutputLength;
-        Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
-                       ? 2 * Capacity
-                       : std::numeric_limits<size_t>::max();
-        Result.resize_for_overwrite(Capacity);
-        Output = static_cast<char *>(Result.data()) + Used;
-        OutputLength = Capacity - Used;
+        HandleOverflow(Capacity, Output, OutputLength, Result);
         return std::error_code();
       } else {
         // Some other error occured.
@@ -276,48 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
 
 std::error_code
 CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
-  char *Output = Result.data();
-  size_t OutputLength = Result.capacity();
-  size_t Capacity = Result.capacity();
-  Result.resize_for_overwrite(Capacity);
-
-  // Handle errors returned from iconv().
-  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
-    if (Ret == static_cast<size_t>(-1)) {
-      // An error occured. Check if we can gracefully handle it.
-      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
-        // No space left in output buffer. Increase the size of the underlying
-        // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
-        // and continue the conversion.
-        const size_t Used = Capacity - OutputLength;
-        Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
-                       ? 2 + Capacity
-                       : std::numeric_limits<size_t>::max();
-        Result.resize_for_overwrite(Capacity);
-        Output = static_cast<char *>(Result.data()) + Used;
-        OutputLength = Capacity - Used;
-        return std::error_code();
-      } else {
-        // Some other error occured.
-        return std::error_code(errno, std::generic_category());
-      }
-    } else {
-      // A positive return value indicates that some characters were converted
-      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
-      // an error in this case makes sure that both conversion routines behave
-      // in the same way.
-      return std::make_error_code(std::errc::illegal_byte_sequence);
-    }
-  };
-
-  size_t Ret;
-  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
-    if (auto EC = HandleError(Ret))
-      return EC;
-
-  // Re-adjust size to actual size.
-  Result.resize(Capacity - OutputLength);
-  return std::error_code();
+  return convert(StringRef(), Result);
 }
 
 #endif // HAVE_ICONV

>From a80a1173e368d990c5730625f989fc65e1b463fd Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 10 Jan 2024 09:52:43 -0500
Subject: [PATCH 3/5] add LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV option

---
 llvm/CMakeLists.txt        |  4 ++++
 llvm/cmake/config-ix.cmake | 24 ++++++++++++++----------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 6f5647d70d8bc1..6c06f509549f50 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -554,6 +554,10 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
+set(LLVM_ENABLE_ICU "ON" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "ON" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 97a9a5816f596b..4cf7188830ccba 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -258,19 +258,23 @@ else()
 endif()
 
 #Check for icu.
-find_package(ICU COMPONENTS uc i18n)
-if(ICU_FOUND)
-  set(HAVE_ICU 1)
-else()
-  set(HAVE_ICU 0)
+if(LLVM_ENABLE_ICU)
+  find_package(ICU COMPONENTS uc i18n)
+  if(ICU_FOUND)
+    set(HAVE_ICU 1)
+  else()
+    set(HAVE_ICU 0)
+  endif()
 endif()
 
 # Check for iconv.
-find_package(Iconv)
-if(Iconv_FOUND)
-  set(HAVE_ICONV 1)
-else()
-  set(HAVE_ICONV 0)
+if(LLVM_ENABLE_ICONV)
+  find_package(Iconv)
+  if(Iconv_FOUND)
+    set(HAVE_ICONV 1)
+  else()
+    set(HAVE_ICONV 0)
+  endif()
 endif()
 
 # function checks

>From 6d6412151155015f561311506b2a5f0afc4ef192 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 31 Jan 2024 14:06:50 -0500
Subject: [PATCH 4/5] remove single char conversion function

---
 llvm/include/llvm/Support/CharSet.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index fd077191c235b5..e573b3da9d7cc3 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -138,12 +138,6 @@ class CharSetConverter {
     return EC;
   }
 
-  char convert(char SingleChar) const {
-    SmallString<1> Result;
-    Converter->convert(StringRef(&SingleChar, 1), Result, false);
-    return Result[0];
-  }
-
   std::error_code flush() const { return Converter->flush(); }
 
   std::error_code flush(SmallVectorImpl<char> &Result) const {

>From 9d0371ab24c48208defcbe42801a826c78ebb298 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 23 Feb 2024 13:35:34 -0500
Subject: [PATCH 5/5] handle FORCE_ON, look for shared libraries only for ICU

---
 llvm/cmake/config-ix.cmake | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 4cf7188830ccba..32f265bbb953ef 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -259,22 +259,31 @@ endif()
 
 #Check for icu.
 if(LLVM_ENABLE_ICU)
-  find_package(ICU COMPONENTS uc i18n)
-  if(ICU_FOUND)
-    set(HAVE_ICU 1)
+  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ".so")
+  if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+    find_package(ICU REQUIRED COMPONENTS uc i18n)
+    if (NOT ICU_FOUND)
+      message(FATAL_ERROR "Failed to configure icu, but LLVM_ENABLE_ICU is FORCE_ON")
+    endif()
   else()
-    set(HAVE_ICU 0)
+    find_package(ICU COMPONENTS uc i18n)
   endif()
+  set(HAVE_ICU ${ICU_FOUND})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
 endif()
 
 # Check for iconv.
 if(LLVM_ENABLE_ICONV)
-  find_package(Iconv)
-  if(Iconv_FOUND)
-    set(HAVE_ICONV 1)
+  if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+    find_package(Iconv REQUIRED)
+    if (NOT Iconv_FOUND)
+      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+    endif()
   else()
-    set(HAVE_ICONV 0)
+    find_package(Iconv)
   endif()
+  set(HAVE_ICONV ${Iconv_FOUND})
 endif()
 
 # function checks