[llvm] Create a CharSetConverter class with both iconv and icu support (PR #74516)

Abhina Sree via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 29 06:52:29 PDT 2024


https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/74516

>From 05b67ca5cd2653bde2cb36dac2006284485ba59b Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 5 Dec 2023 15:08:43 -0500
Subject: [PATCH 01/14] Create a CharSetConverter class with both iconv and icu
 support.

---
 llvm/cmake/config-ix.cmake              |  16 +
 llvm/include/llvm/Config/config.h.cmake |   6 +
 llvm/include/llvm/Support/CharSet.h     | 160 ++++++++++
 llvm/lib/Support/CMakeLists.txt         |  17 ++
 llvm/lib/Support/CharSet.cpp            | 370 ++++++++++++++++++++++++
 llvm/unittests/Support/CMakeLists.txt   |   1 +
 llvm/unittests/Support/CharSetTest.cpp  | 281 ++++++++++++++++++
 7 files changed, 851 insertions(+)
 create mode 100644 llvm/include/llvm/Support/CharSet.h
 create mode 100644 llvm/lib/Support/CharSet.cpp
 create mode 100644 llvm/unittests/Support/CharSetTest.cpp

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index bf1b110245bb2f..97a9a5816f596b 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -257,6 +257,22 @@ else()
   set(LLVM_ENABLE_TERMINFO 0)
 endif()
 
+# Check for ICU.
+find_package(ICU COMPONENTS uc i18n)
+if(ICU_FOUND)
+  set(HAVE_ICU 1)
+else()
+  set(HAVE_ICU 0)
+endif()
+
+# Check for iconv.
+find_package(Iconv)
+if(Iconv_FOUND)
+  set(HAVE_ICONV 1)
+else()
+  set(HAVE_ICONV 0)
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 977c182e9d2b0d..ff169e4bedef2b 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -281,6 +281,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define if icu library is available */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 00000000000000..856b3be65ff7ed
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,160 @@
+//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <functional>
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+class CharSetConverterImplBase {
+public:
+  virtual ~CharSetConverterImplBase() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+
+  virtual std::error_code convert(StringRef Source,
+                                  SmallVectorImpl<char> &Result,
+                                  bool ShouldAutoFlush) const = 0;
+
+  /// Restore the conversion to the original state.
+  /// \return error code in case something went wrong
+  ///
+  /// If the original character set or the destination character set
+  /// are multi-byte character sets, set the shift state to the initial
+  /// state. Otherwise this is a no-op.
+  virtual std::error_code flush() const = 0;
+
+  virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+enum class id {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+class CharSetConverter {
+  // details::CharSetConverterImplBase *Converter;
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// \param[in] CSFrom id of the source character encoding
+  /// \param[in] CSTo id of the target character encoding
+  /// \return a CharSetConverter instance
+  static CharSetConverter create(text_encoding::id CSFrom,
+                                 text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  /// Move operations transfer ownership of the implementation object.
+  CharSetConverter(CharSetConverter &&Other) noexcept
+      : Converter(std::move(Other.Converter)) {}
+
+  CharSetConverter &operator=(CharSetConverter &&Other) noexcept {
+    if (this != &Other)
+      Converter = std::move(Other.Converter);
+    return *this;
+  }
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings.
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return Converter->convert(Source, Result, ShouldAutoFlush);
+  }
+
+  char convert(char SingleChar) const {
+    SmallString<1> Result;
+    Converter->convert(StringRef(&SingleChar, 1), Result, false);
+    return Result[0];
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  std::error_code convert(const std::string &Source,
+                          SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return convert(StringRef(Source), Result, ShouldAutoFlush);
+  }
+
+  std::error_code flush() const { return Converter->flush(); }
+
+  std::error_code flush(SmallVectorImpl<char> &Result) const {
+    return Converter->flush(Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 03e888958a0711..b0129ddaa882e6 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -153,6 +153,7 @@ add_llvm_component_library(LLVMSupport
   CachePruning.cpp
   Caching.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
@@ -295,6 +296,22 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link icu library if it is an external library.
+if(ICU_FOUND)
+  target_link_libraries(LLVMSupport
+  PRIVATE
+  ${ICU_LIBRARIES}
+  )
+else()
+  # Link iconv library if it is an external library.
+  if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+    target_link_libraries(LLVMSupport
+    PRIVATE
+    ${Iconv_LIBRARIES}
+    )
+  endif()
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 00000000000000..dbc2cb7c1839d2
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,370 @@
+//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) {
+      Ch = toLower(Ch);
+      if (Ch != '0' || PrevDigit) {
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch);
+      }
+    }
+  }
+}
+
+// Maps the charset name to enum constant if possible.
+std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+#define CSNAME(CS, STR)                                                        \
+  if (Normalized.equals(STR))                                                  \
+  return CS
+  CSNAME(text_encoding::id::UTF8, "utf8");
+  CSNAME(text_encoding::id::IBM1047, "ibm1047");
+#undef CSNAME
+  return std::nullopt;
+}
+
+namespace {
+enum ConversionType {
+  UTFToIBM1047,
+  IBM1047ToUTF,
+};
+
+// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase {
+  ConversionType ConvType;
+
+public:
+  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterTable::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  if (ConvType == IBM1047ToUTF) {
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  } else if (ConvType == UTFToIBM1047) {
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+std::error_code CharSetConverterTable::flush() const {
+  return std::error_code();
+}
+
+std::error_code
+CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
+}
+
+#ifdef HAVE_ICU
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  UConverter *FromConvDesc;
+  UConverter *ToConvDesc;
+
+public:
+  CharSetConverterICU(UConverter *Converter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    FromConvDesc = nullptr;
+    ToConvDesc = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC)) {
+      ToConvDesc = nullptr;
+    }
+  };
+
+  CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    FromConvDesc = ucnv_safeClone(FromConverter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC))
+      FromConvDesc = nullptr;
+    ToConvDesc = ucnv_safeClone(ToConverter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC))
+      ToConvDesc = nullptr;
+  }
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterICU::convert(StringRef Source,
+                                             SmallVectorImpl<char> &Result,
+                                             bool ShouldAutoFlush) const {
+  // Setup the output. We directly write into the SmallVector.
+  size_t OutputLength, Capacity = Result.capacity();
+  char *Output, *Out;
+
+  UErrorCode EC = U_ZERO_ERROR;
+
+  auto HandleError = [&Capacity, &Output, &OutputLength,
+                      &Result](UErrorCode UEC) {
+    if (UEC == U_BUFFER_OVERFLOW_ERROR &&
+        Capacity < std::numeric_limits<size_t>::max()) {
+      // No space left in output buffer. Double the size of the underlying
+      // memory in the SmallVectorImpl, adjust pointer and length and continue
+      // the conversion.
+      Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                     ? 2 * Capacity
+                     : std::numeric_limits<size_t>::max();
+      Result.resize_for_overwrite(Capacity);
+      Output = static_cast<char *>(Result.data());
+      OutputLength = Capacity;
+      return std::error_code();
+    } else {
+      // Some other error occurred.
+      return std::error_code(errno, std::generic_category());
+    }
+  };
+
+  do {
+    EC = U_ZERO_ERROR;
+    size_t InputLength = Source.size();
+    const char *Input =
+        InputLength ? const_cast<char *>(Source.data()) : nullptr;
+    const char *In = Input;
+    Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+    OutputLength = Capacity;
+    Out = Output;
+    Result.resize_for_overwrite(Capacity);
+    ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
+                   &Input, In + InputLength, /*pivotStart=*/NULL,
+                   /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
+                   /*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC);
+    if (U_FAILURE(EC)) {
+      if (auto error = HandleError(EC))
+        return error;
+    } else if (U_SUCCESS(EC))
+      break;
+  } while (U_FAILURE(EC));
+
+  Result.resize(Output - Out);
+  return std::error_code();
+}
+
+std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
+
+std::error_code
+CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
+}
+
+#elif defined(HAVE_ICONV)
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  iconv_t ConvDesc;
+
+public:
+  CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterIconv::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  // Setup the input. Use nullptr to reset iconv state if input length is zero.
+  size_t InputLength = Source.size();
+  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  size_t OutputLength = Capacity;
+
+  size_t Ret;
+
+  // Handle errors returned from iconv().
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // An error occurred. Check if we can gracefully handle it.
+      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+        // No space left in output buffer. Double the size of the underlying
+        // memory in the SmallVectorImpl, adjust pointer and length and continue
+        // the conversion.
+        const size_t Used = Capacity - OutputLength;
+        Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                       ? 2 * Capacity
+                       : std::numeric_limits<size_t>::max();
+        Result.resize_for_overwrite(Capacity);
+        Output = static_cast<char *>(Result.data()) + Used;
+        OutputLength = Capacity - Used;
+        return std::error_code();
+      } else {
+        // Some other error occurred.
+        return std::error_code(errno, std::generic_category());
+      }
+    } else {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+  };
+
+  // Convert the string.
+  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+  if (ShouldAutoFlush) {
+    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+      if (auto EC = HandleError(Ret))
+        return EC;
+  }
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+std::error_code CharSetConverterIconv::flush() const {
+  size_t Ret = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+  if (Ret == static_cast<size_t>(-1)) {
+    return std::error_code(errno, std::generic_category());
+  }
+  return std::error_code();
+}
+
+std::error_code
+CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
+  char *Output = Result.data();
+  size_t OutputLength = Result.capacity();
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+
+  // Handle errors returned from iconv().
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // An error occurred. Check if we can gracefully handle it.
+      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+        // No space left in output buffer. Increase the size of the underlying
+        // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
+        // and continue the conversion.
+        const size_t Used = Capacity - OutputLength;
+        Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
+                       ? 2 + Capacity
+                       : std::numeric_limits<size_t>::max();
+        Result.resize_for_overwrite(Capacity);
+        Output = static_cast<char *>(Result.data()) + Used;
+        OutputLength = Capacity - Used;
+        return std::error_code();
+      } else {
+        // Some other error occurred.
+        return std::error_code(errno, std::generic_category());
+      }
+    } else {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+  };
+
+  size_t Ret;
+  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
+                                          text_encoding::id CPTo) {
+
+  assert(CPFrom != CPTo && "Text encodings should be distinct");
+
+  ConversionType Conversion;
+  if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
+    Conversion = UTFToIBM1047;
+  else
+    Conversion = IBM1047ToUTF;
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterTable>(Conversion);
+
+  return CharSetConverter(std::move(Converter));
+}
+
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {
+  // The table converter requires distinct encodings (the enum overload asserts
+  // this), so identical names fall through to ICU/iconv like unknown ones.
+  std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
+  std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+  if (From && To && *From != *To)
+    return create(*From, *To);
+#ifdef HAVE_ICU
+  // ICU never sets errno, so failures are reported as invalid_argument.
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverter *FromConvDesc = ucnv_open(CSFrom.str().c_str(), &EC);
+  UConverter *ToConvDesc = ucnv_open(CSTo.str().c_str(), &EC);
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+  if (U_SUCCESS(EC))
+    Converter = std::make_unique<CharSetConverterICU>(FromConvDesc, ToConvDesc);
+  // CharSetConverterICU clones its arguments; release the originals here.
+  ucnv_close(FromConvDesc);
+  ucnv_close(ToConvDesc);
+  if (Converter)
+    return CharSetConverter(std::move(Converter));
+#elif defined(HAVE_ICONV)
+  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  return CharSetConverter(std::make_unique<CharSetConverterIconv>(ConvDesc));
+#endif
+  return std::make_error_code(std::errc::invalid_argument);
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 2718be8450f805..21fd9862685ab6 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_unittest(SupportTests
   BalancedPartitioningTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
+  CharSetTest.cpp
   CrashRecoveryTest.cpp
   Casting.cpp
   CheckedArithmeticTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
new file mode 100644
index 00000000000000..2f2d8f97102b98
--- /dev/null
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,281 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!\n"
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+// The same string encoded in ISO-2022-JP and in IBM-939; each encoding ends
+// with the sequence that shifts back to the initial state.
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::UTF8,
+                                                   text_encoding::id::IBM1047);
+  std::error_code EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrA;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentUTF;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Src = CyrillicUTF;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(CharSet, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::IBM1047,
+                                                   text_encoding::id::UTF8);
+  std::error_code EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrE;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentE;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, RoundTrip) {
+  ErrorOr<CharSetConverter> ConvToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Setup source string.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str, true);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str, true);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+TEST(CharSet, ShiftState2022) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvToIBM939 =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToIBM939) {
+    ASSERT_EQ(ConvToIBM939.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvToIBM939->convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+#if not defined(HAVE_ICU) && defined(HAVE_ICONV)
+
+// Identical to EarthUTF, except the final character (球) has its last byte
+// taken away from it.
+static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
+static const char EarthISO2022ShiftBack[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
+static const char ShiftBackOnly[] = "\x1B\x28\x42";
+
+// String "地球".
+static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthKanjiOnlyISO2022[] =
+    "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
+static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, ShiftState2022Flush) {
+  StringRef Src0(EarthUTFBroken);
+  StringRef Src1(EarthKanjiOnlyUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo2022Flush =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!ConvTo2022Flush) {
+    ASSERT_EQ(ConvTo2022Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // This should emit an error; there is a malformed multibyte character in the
+  // input string.
+  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, true);
+  EXPECT_TRUE(EC0);
+  std::error_code EC1 = ConvTo2022Flush->flush();
+  EXPECT_TRUE(!EC1);
+  std::error_code EC2 = ConvTo2022Flush->convert(Src1, Dst1, true);
+  EXPECT_TRUE(!EC2);
+  EXPECT_STREQ(EarthKanjiOnlyISO2022, static_cast<std::string>(Dst1).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939Flush) {
+  StringRef Src0(EarthUTFBroken);
+  StringRef Src1(EarthKanjiOnlyUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo939Flush =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  if (!ConvTo939Flush) {
+    ASSERT_EQ(ConvTo939Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // This should emit an error; there is a malformed multibyte character in the
+  // input string.
+  std::error_code EC0 = ConvTo939Flush->convert(Src0, Dst0, true);
+  EXPECT_TRUE(EC0);
+  std::error_code EC1 = ConvTo939Flush->flush();
+  EXPECT_TRUE(!EC1);
+  std::error_code EC2 = ConvTo939Flush->convert(Src1, Dst1, true);
+  EXPECT_TRUE(!EC2);
+  EXPECT_STREQ(EarthKanjiOnlyIBM939, static_cast<std::string>(Dst1).c_str());
+}
+
+TEST(CharSet, ShiftState2022Flush1) {
+  StringRef Src0(EarthUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo2022Flush =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!ConvTo2022Flush) {
+    ASSERT_EQ(ConvTo2022Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, false);
+  EXPECT_TRUE(!EC0);
+  EXPECT_STREQ(EarthISO2022ShiftBack, static_cast<std::string>(Dst0).c_str());
+  std::error_code EC1 = ConvTo2022Flush->flush(Dst1);
+  EXPECT_TRUE(!EC1);
+  EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Dst1).c_str());
+}
+
+#endif
+
+} // namespace

>From b9740ea25aca702d01163e4347448830ceb937fc Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 9 Jan 2024 14:47:24 -0500
Subject: [PATCH 02/14] address review comments

---
 llvm/include/llvm/Support/CharSet.h |  38 ++++-----
 llvm/lib/Support/CharSet.cpp        | 116 ++++++++--------------------
 2 files changed, 49 insertions(+), 105 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 856b3be65ff7ed..fd077191c235b5 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -1,4 +1,4 @@
-//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -35,9 +35,9 @@ class CharSetConverterImplBase {
 
   /// Converts a string.
   /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
+  /// \param[out] Result container for converted string
   /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings iff true.
+  /// for stateful encodings if true.
   /// \return error code in case something went wrong
   ///
   /// The following error codes can occur, among others:
@@ -59,9 +59,9 @@ class CharSetConverterImplBase {
   /// Restore the conversion to the original state.
   /// \return error code in case something went wrong
   ///
-  /// If the original character set or the destination character set
-  /// are multi-byte character sets, set the shift state to the initial
-  /// state. Otherwise this is a no-op.
+  /// If the destination character set is a stateful character set,
+  /// set the shift state to the initial state.
+  /// Otherwise this is a no-op.
   virtual std::error_code flush() const = 0;
 
   virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
@@ -80,7 +80,6 @@ enum class id {
 } // end namespace text_encoding
 
 /// Utility class to convert between different character set encodings.
-/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
 class CharSetConverter {
   // details::CharSetConverterImplBase *Converter;
   std::unique_ptr<details::CharSetConverterImplBase> Converter;
@@ -121,33 +120,30 @@ class CharSetConverter {
 
   /// Converts a string.
   /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
+  /// \param[out] Result container for converted string
   /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings.
+  /// for stateful encodings.
   /// \return error code in case something went wrong
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush = true) const {
     return Converter->convert(Source, Result, ShouldAutoFlush);
   }
 
+  ErrorOr<std::string> convert(StringRef Source,
+                               bool ShouldAutoFlush = true) const {
+    SmallString<1> Result;
+    auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
+    if (!EC)
+      return std::string(Result);
+    return EC;
+  }
+
   char convert(char SingleChar) const {
     SmallString<1> Result;
     Converter->convert(StringRef(&SingleChar, 1), Result, false);
     return Result[0];
   }
 
-  /// Converts a string.
-  /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
-  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings iff true.
-  /// \return error code in case something went wrong
-  std::error_code convert(const std::string &Source,
-                          SmallVectorImpl<char> &Result,
-                          bool ShouldAutoFlush = true) const {
-    return convert(StringRef(Source), Result, ShouldAutoFlush);
-  }
-
   std::error_code flush() const { return Converter->flush(); }
 
   std::error_code flush(SmallVectorImpl<char> &Result) const {
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index dbc2cb7c1839d2..1a49d665fdbda0 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -1,4 +1,4 @@
-//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//===-- CharSet.cpp - Character set conversion class --------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -32,7 +32,8 @@ using namespace llvm;
 
 // Normalize the charset name with the charset alias matching algorithm proposed
 // in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
-void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
   bool PrevDigit = false;
   for (auto Ch : CSName) {
     if (isAlnum(Ch)) {
@@ -49,15 +50,26 @@ void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
 std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   SmallString<16> Normalized;
   normalizeCharSetName(CSName, Normalized);
-#define CSNAME(CS, STR)                                                        \
-  if (Normalized.equals(STR))                                                  \
-  return CS
-  CSNAME(text_encoding::id::UTF8, "utf8");
-  CSNAME(text_encoding::id::IBM1047, "ibm1047");
-#undef CSNAME
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
   return std::nullopt;
 }
 
+void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+                    SmallVectorImpl<char> &Result) {
+  // No space left in output buffer. Double the size of the underlying
+  // memory in the SmallVectorImpl, adjust pointer and length and continue
+  // the conversion.
+  Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                 ? 2 * Capacity
+                 : std::numeric_limits<size_t>::max();
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
 namespace {
 enum ConversionType {
   UTFToIBM1047,
@@ -138,31 +150,12 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                                              SmallVectorImpl<char> &Result,
                                              bool ShouldAutoFlush) const {
   // Setup the output. We directly write into the SmallVector.
+  Result.resize_for_overwrite(Source.size());
   size_t OutputLength, Capacity = Result.capacity();
   char *Output, *Out;
 
   UErrorCode EC = U_ZERO_ERROR;
 
-  auto HandleError = [&Capacity, &Output, &OutputLength,
-                      &Result](UErrorCode UEC) {
-    if (UEC == U_BUFFER_OVERFLOW_ERROR &&
-        Capacity < std::numeric_limits<size_t>::max()) {
-      // No space left in output buffer. Double the size of the underlying
-      // memory in the SmallVectorImpl, adjust pointer and length and continue
-      // the conversion.
-      Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
-                     ? 2 * Capacity
-                     : std::numeric_limits<size_t>::max();
-      Result.resize_for_overwrite(Capacity);
-      Output = static_cast<char *>(Result.data());
-      OutputLength = Capacity;
-      return std::error_code();
-    } else {
-      // Some other error occured.
-      return std::error_code(errno, std::generic_category());
-    }
-  };
-
   do {
     EC = U_ZERO_ERROR;
     size_t InputLength = Source.size();
@@ -176,10 +169,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
     ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
-                   /*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC);
+                   /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/ShouldAutoFlush, &EC);
     if (U_FAILURE(EC)) {
-      if (auto error = HandleError(EC))
-        return error;
+      if (EC == U_BUFFER_OVERFLOW_ERROR &&
+          Capacity < std::numeric_limits<size_t>::max())
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+      else
+        // Some other error occured.
+        return std::error_code(errno, std::generic_category());
     } else if (U_SUCCESS(EC))
       break;
   } while (U_FAILURE(EC));
@@ -215,8 +213,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   size_t InputLength = Source.size();
   char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
   // Setup the output. We directly write into the SmallVector.
+  Result.resize_for_overwrite(Source.size());
   size_t Capacity = Result.capacity();
-  Result.resize_for_overwrite(Capacity);
   char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
   size_t OutputLength = Capacity;
 
@@ -227,16 +225,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
     if (Ret == static_cast<size_t>(-1)) {
       // An error occured. Check if we can gracefully handle it.
       if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
-        // No space left in output buffer. Double the size of the underlying
-        // memory in the SmallVectorImpl, adjust pointer and length and continue
-        // the conversion.
-        const size_t Used = Capacity - OutputLength;
-        Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
-                       ? 2 * Capacity
-                       : std::numeric_limits<size_t>::max();
-        Result.resize_for_overwrite(Capacity);
-        Output = static_cast<char *>(Result.data()) + Used;
-        OutputLength = Capacity - Used;
+        HandleOverflow(Capacity, Output, OutputLength, Result);
         return std::error_code();
       } else {
         // Some other error occured.
@@ -276,48 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
 
 std::error_code
 CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
-  char *Output = Result.data();
-  size_t OutputLength = Result.capacity();
-  size_t Capacity = Result.capacity();
-  Result.resize_for_overwrite(Capacity);
-
-  // Handle errors returned from iconv().
-  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
-    if (Ret == static_cast<size_t>(-1)) {
-      // An error occured. Check if we can gracefully handle it.
-      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
-        // No space left in output buffer. Increase the size of the underlying
-        // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
-        // and continue the conversion.
-        const size_t Used = Capacity - OutputLength;
-        Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
-                       ? 2 + Capacity
-                       : std::numeric_limits<size_t>::max();
-        Result.resize_for_overwrite(Capacity);
-        Output = static_cast<char *>(Result.data()) + Used;
-        OutputLength = Capacity - Used;
-        return std::error_code();
-      } else {
-        // Some other error occured.
-        return std::error_code(errno, std::generic_category());
-      }
-    } else {
-      // A positive return value indicates that some characters were converted
-      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
-      // an error in this case makes sure that both conversion routines behave
-      // in the same way.
-      return std::make_error_code(std::errc::illegal_byte_sequence);
-    }
-  };
-
-  size_t Ret;
-  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
-    if (auto EC = HandleError(Ret))
-      return EC;
-
-  // Re-adjust size to actual size.
-  Result.resize(Capacity - OutputLength);
-  return std::error_code();
+  return convert(nullptr, Result);
 }
 
 #endif // HAVE_ICONV

>From 3650a690e22d3bcc69482ac69b62c43adc978c84 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 10 Jan 2024 09:52:43 -0500
Subject: [PATCH 03/14] add LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV option

---
 llvm/CMakeLists.txt        |  4 ++++
 llvm/cmake/config-ix.cmake | 24 ++++++++++++++----------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 43181af3bc1953..110069c7a764de 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -554,6 +554,10 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
+set(LLVM_ENABLE_ICU "ON" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "ON" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 97a9a5816f596b..4cf7188830ccba 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -258,19 +258,23 @@ else()
 endif()
 
 #Check for icu.
-find_package(ICU COMPONENTS uc i18n)
-if(ICU_FOUND)
-  set(HAVE_ICU 1)
-else()
-  set(HAVE_ICU 0)
+if(LLVM_ENABLE_ICU)
+  find_package(ICU COMPONENTS uc i18n)
+  if(ICU_FOUND)
+    set(HAVE_ICU 1)
+  else()
+    set(HAVE_ICU 0)
+  endif()
 endif()
 
 # Check for iconv.
-find_package(Iconv)
-if(Iconv_FOUND)
-  set(HAVE_ICONV 1)
-else()
-  set(HAVE_ICONV 0)
+if(LLVM_ENABLE_ICONV)
+  find_package(Iconv)
+  if(Iconv_FOUND)
+    set(HAVE_ICONV 1)
+  else()
+    set(HAVE_ICONV 0)
+  endif()
 endif()
 
 # function checks

>From c8ed744483e7e137efd4a03ffa986f2766d71bbf Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 31 Jan 2024 14:06:50 -0500
Subject: [PATCH 04/14] remove single char conversion function

---
 llvm/include/llvm/Support/CharSet.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index fd077191c235b5..e573b3da9d7cc3 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -138,12 +138,6 @@ class CharSetConverter {
     return EC;
   }
 
-  char convert(char SingleChar) const {
-    SmallString<1> Result;
-    Converter->convert(StringRef(&SingleChar, 1), Result, false);
-    return Result[0];
-  }
-
   std::error_code flush() const { return Converter->flush(); }
 
   std::error_code flush(SmallVectorImpl<char> &Result) const {

>From 6bd8940a7b28ff34bb0ecc4e98a4bfecbb40c76f Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 23 Feb 2024 13:35:34 -0500
Subject: [PATCH 05/14] handle FORCE_ON, look for shared libraries only for ICU

---
 llvm/cmake/config-ix.cmake | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 4cf7188830ccba..32f265bbb953ef 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -259,22 +259,31 @@ endif()
 
 #Check for icu.
 if(LLVM_ENABLE_ICU)
-  find_package(ICU COMPONENTS uc i18n)
-  if(ICU_FOUND)
-    set(HAVE_ICU 1)
+  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ".so")
+  if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+    find_package(ICU REQUIRED COMPONENTS uc i18n)
+    if (NOT ICU_FOUND)
+      message(FATAL_ERROR "Failed to configure icu, but LLVM_ENABLE_ICU is FORCE_ON")
+    endif()
   else()
-    set(HAVE_ICU 0)
+    find_package(ICU COMPONENTS uc i18n)
   endif()
+  set(HAVE_ICU ${ICU_FOUND})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
 endif()
 
 # Check for iconv.
 if(LLVM_ENABLE_ICONV)
-  find_package(Iconv)
-  if(Iconv_FOUND)
-    set(HAVE_ICONV 1)
+  if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+    find_package(Iconv REQUIRED)
+    if (NOT Iconv_FOUND)
+      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+    endif()
   else()
-    set(HAVE_ICONV 0)
+    find_package(Iconv)
   endif()
+  set(HAVE_ICONV ${Iconv_FOUND})
 endif()
 
 # function checks

>From ad40825cf9bd2a6f023adebcb86da9a7083b792a Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 4 Apr 2024 13:45:13 -0400
Subject: [PATCH 06/14] only allow builtin iconv support

---
 llvm/cmake/config-ix.cmake      | 4 +++-
 llvm/lib/Support/CMakeLists.txt | 8 --------
 llvm/lib/Support/CharSet.cpp    | 4 ++--
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 32f265bbb953ef..422938a3e1cc33 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -283,7 +283,9 @@ if(LLVM_ENABLE_ICONV)
   else()
     find_package(Iconv)
   endif()
-  set(HAVE_ICONV ${Iconv_FOUND})
+  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+    set(HAVE_ICONV 1)
+  endif()
 endif()
 
 # function checks
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index b0129ddaa882e6..fe46b69e42290e 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -302,14 +302,6 @@ if(ICU_FOUND)
   PRIVATE
   ${ICU_LIBRARIES}
   )
-else()
-  # Link iconv library if it is an external library.
-  if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
-    target_link_libraries(LLVMSupport
-    PRIVATE
-    ${Iconv_LIBRARIES}
-    )
-  endif()
 endif()
 
 set(llvm_system_libs ${system_libs})
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 1a49d665fdbda0..330f420c452232 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -215,7 +215,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   // Setup the output. We directly write into the SmallVector.
   Result.resize_for_overwrite(Source.size());
   size_t Capacity = Result.capacity();
-  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  char *Output = static_cast<char *>(Result.data());
   size_t OutputLength = Capacity;
 
   size_t Ret;
@@ -265,7 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
 
 std::error_code
 CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
-  return convert(nullptr, Result);
+  return convert("", Result, /*ShouldAutoFlush=*/true);
 }
 
 #endif // HAVE_ICONV

>From 0dda2bb1795be418ab6d6142f08568681aa6a76c Mon Sep 17 00:00:00 2001
From: Abhina Sree <69635948+abhina-sree at users.noreply.github.com>
Date: Wed, 17 Apr 2024 08:33:03 -0400
Subject: [PATCH 07/14] Update llvm/cmake/config-ix.cmake

Co-authored-by: Eli Friedman <efriedma at quicinc.com>
---
 llvm/cmake/config-ix.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 422938a3e1cc33..43df3851fc2188 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -277,7 +277,7 @@ endif()
 if(LLVM_ENABLE_ICONV)
   if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
     find_package(Iconv REQUIRED)
-    if (NOT Iconv_FOUND)
+    if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
       message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
     endif()
   else()

>From 886cbb7a6813294e7fb1ffd81bf63f00998b1805 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 18 Apr 2024 13:31:20 -0400
Subject: [PATCH 08/14] add ICU license

---
 llvm/lib/Support/CharSet.cpp | 40 ++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 330f420c452232..83d4d9243f2e4a 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -12,6 +12,46 @@
 ///
 //===----------------------------------------------------------------------===//
 
+// UNICODE LICENSE V3
+//
+// COPYRIGHT AND PERMISSION NOTICE
+//
+// Copyright © 2016-2024 Unicode, Inc.
+//
+// NOTICE TO USER: Carefully read the following legal agreement. BY
+// DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+// SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+// TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+// DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of data files and any associated documentation (the "Data Files") or
+// software and any associated documentation (the "Software") to deal in the
+// Data Files or Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, and/or sell
+// copies of the Data Files or Software, and to permit persons to whom the
+// Data Files or Software are furnished to do so, provided that either (a)
+// this copyright and permission notice appear with all copies of the Data
+// Files or Software, or (b) this copyright and permission notice appear in
+// associated Documentation.
+//
+// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+// KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+// THIRD PARTY RIGHTS.
+//
+// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+// BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+// FILES OR SOFTWARE.
+
+// Except as contained in this notice, the name of a copyright holder shall
+// not be used in advertising or otherwise to promote the sale, use or other
+// dealings in these Data Files or Software without prior written
+// authorization of the copyright holder.
+
 #include "llvm/Support/CharSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"

>From 7ca4b94f1426c3757a9f37bffaee2a7fa91e0c56 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 19 Apr 2024 11:53:20 -0400
Subject: [PATCH 09/14] Revert "add ICU license"

This reverts commit 1f228eff515c6200d1f6ab70471223fc7d78de69.
---
 llvm/lib/Support/CharSet.cpp | 40 ------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 83d4d9243f2e4a..330f420c452232 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -12,46 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-// UNICODE LICENSE V3
-//
-// COPYRIGHT AND PERMISSION NOTICE
-//
-// Copyright © 2016-2024 Unicode, Inc.
-//
-// NOTICE TO USER: Carefully read the following legal agreement. BY
-// DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
-// SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
-// TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
-// DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of data files and any associated documentation (the "Data Files") or
-// software and any associated documentation (the "Software") to deal in the
-// Data Files or Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, and/or sell
-// copies of the Data Files or Software, and to permit persons to whom the
-// Data Files or Software are furnished to do so, provided that either (a)
-// this copyright and permission notice appear with all copies of the Data
-// Files or Software, or (b) this copyright and permission notice appear in
-// associated Documentation.
-//
-// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
-// KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
-// THIRD PARTY RIGHTS.
-//
-// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
-// BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
-// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
-// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
-// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
-// FILES OR SOFTWARE.
-
-// Except as contained in this notice, the name of a copyright holder shall
-// not be used in advertising or otherwise to promote the sale, use or other
-// dealings in these Data Files or Software without prior written
-// authorization of the copyright holder.
-
 #include "llvm/Support/CharSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"

>From 6e7df09f8a7b528fb8fe16ecff68f1bb14514bf2 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 19 Apr 2024 11:55:35 -0400
Subject: [PATCH 10/14] address comments

---
 llvm/include/llvm/Support/CharSet.h |  2 +-
 llvm/lib/Support/CharSet.cpp        | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index e573b3da9d7cc3..b2c50cd423d6da 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -131,7 +131,7 @@ class CharSetConverter {
 
   ErrorOr<std::string> convert(StringRef Source,
                                bool ShouldAutoFlush = true) const {
-    SmallString<1> Result;
+    SmallString<100> Result;
     auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
     if (!EC)
       return std::string(Result);
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 330f420c452232..55eb5f3f1ad6d6 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -47,7 +47,7 @@ static void normalizeCharSetName(StringRef CSName,
 }
 
 // Maps the charset name to enum constant if possible.
-std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   SmallString<16> Normalized;
   normalizeCharSetName(CSName, Normalized);
   if (Normalized.equals("utf8"))
@@ -57,8 +57,9 @@ std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   return std::nullopt;
 }
 
-void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
-                    SmallVectorImpl<char> &Result) {
+static void HandleOverflow(size_t &Capacity, char *&Output,
+                           size_t &OutputLength,
+                           SmallVectorImpl<char> &Result) {
   // No space left in output buffer. Double the size of the underlying
   // memory in the SmallVectorImpl, adjust pointer and length and continue
   // the conversion.
@@ -150,9 +151,10 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                                              SmallVectorImpl<char> &Result,
                                              bool ShouldAutoFlush) const {
   // Setup the output. We directly write into the SmallVector.
-  Result.resize_for_overwrite(Source.size());
-  size_t OutputLength, Capacity = Result.capacity();
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
   char *Output, *Out;
+  Result.resize_for_overwrite(Capacity);
 
   UErrorCode EC = U_ZERO_ERROR;
 
@@ -163,9 +165,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
         InputLength ? const_cast<char *>(Source.data()) : nullptr;
     const char *In = Input;
     Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
-    OutputLength = Capacity;
     Out = Output;
-    Result.resize_for_overwrite(Capacity);
     ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
@@ -177,7 +177,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
         HandleOverflow(Capacity, Output, OutputLength, Result);
       else
         // Some other error occured.
-        return std::error_code(errno, std::generic_category());
+        return std::error_code(EILSEQ, std::generic_category());
     } else if (U_SUCCESS(EC))
       break;
   } while (U_FAILURE(EC));

>From 51bfc0725ef16917898824b558a4e5bc49a97270 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 22 Apr 2024 13:27:47 -0400
Subject: [PATCH 11/14] remove function to get shift back characters, address
 comments

---
 llvm/include/llvm/Support/CharSet.h    |  6 --
 llvm/lib/Support/CharSet.cpp           | 31 +++-------
 llvm/unittests/Support/CharSetTest.cpp | 85 --------------------------
 3 files changed, 7 insertions(+), 115 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index b2c50cd423d6da..b3bc138518b1a0 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -63,8 +63,6 @@ class CharSetConverterImplBase {
   /// set the shift state to the initial state.
   /// Otherwise this is a no-op.
   virtual std::error_code flush() const = 0;
-
-  virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
 };
 } // namespace details
 
@@ -139,10 +137,6 @@ class CharSetConverter {
   }
 
   std::error_code flush() const { return Converter->flush(); }
-
-  std::error_code flush(SmallVectorImpl<char> &Result) const {
-    return Converter->flush(Result);
-  }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 55eb5f3f1ad6d6..73cd34b535b106 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -91,7 +91,6 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
   std::error_code flush() const override;
-  std::error_code flush(SmallVectorImpl<char> &Result) const override;
 };
 
 std::error_code CharSetConverterTable::convert(StringRef Source,
@@ -111,11 +110,6 @@ std::error_code CharSetConverterTable::flush() const {
   return std::error_code();
 }
 
-std::error_code
-CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
-  return std::error_code();
-}
-
 #ifdef HAVE_ICU
 class CharSetConverterICU : public details::CharSetConverterImplBase {
   UConverter *FromConvDesc;
@@ -144,7 +138,6 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
   std::error_code flush() const override;
-  std::error_code flush(SmallVectorImpl<char> &Result) const override;
 };
 
 std::error_code CharSetConverterICU::convert(StringRef Source,
@@ -164,7 +157,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
     const char *Input =
         InputLength ? const_cast<char *>(Source.data()) : nullptr;
     const char *In = Input;
-    Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+    Output = static_cast<char *>(Result.data());
     Out = Output;
     ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
@@ -173,14 +166,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                    /*flush=*/ShouldAutoFlush, &EC);
     if (U_FAILURE(EC)) {
       if (EC == U_BUFFER_OVERFLOW_ERROR &&
-          Capacity < std::numeric_limits<size_t>::max())
+          Capacity < std::numeric_limits<size_t>::max()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
-      else
+        continue;
+      } else
         // Some other error occurred.
         return std::error_code(EILSEQ, std::generic_category());
-    } else if (U_SUCCESS(EC))
-      break;
-  } while (U_FAILURE(EC));
+    }
+    break;
+  } while (true);
 
   Result.resize(Output - Out);
   return std::error_code();
@@ -188,11 +182,6 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
 
 std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
 
-std::error_code
-CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
-  return std::error_code();
-}
-
 #elif defined(HAVE_ICONV)
 class CharSetConverterIconv : public details::CharSetConverterImplBase {
   iconv_t ConvDesc;
@@ -203,7 +192,6 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
   std::error_code flush() const override;
-  std::error_code flush(SmallVectorImpl<char> &Result) const override;
 };
 
 std::error_code CharSetConverterIconv::convert(StringRef Source,
@@ -263,11 +251,6 @@ std::error_code CharSetConverterIconv::flush() const {
   return std::error_code();
 }
 
-std::error_code
-CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
-  return convert("", Result, /*ShouldAutoFlush=*/true);
-}
-
 #endif // HAVE_ICONV
 } // namespace
 
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 2f2d8f97102b98..4628a44ef7fff2 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -193,89 +193,4 @@ TEST(CharSet, ShiftStateIBM939) {
   EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
 }
 
-#if not defined(HAVE_ICU) && defined(HAVE_ICONV)
-
-// Identical to EarthUTF, except the final character (球) has its last byte
-// taken away from it.
-static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
-static const char EarthISO2022ShiftBack[] =
-    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
-static const char ShiftBackOnly[] = "\x1B\x28\x42";
-
-// String "地球".
-static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
-static const char EarthKanjiOnlyISO2022[] =
-    "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
-static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
-
-TEST(CharSet, ShiftState2022Flush) {
-  StringRef Src0(EarthUTFBroken);
-  StringRef Src1(EarthKanjiOnlyUTF);
-  SmallString<64> Dst0;
-  SmallString<64> Dst1;
-  ErrorOr<CharSetConverter> ConvTo2022Flush =
-      CharSetConverter::create("UTF-8", "ISO-2022-JP");
-  if (!ConvTo2022Flush) {
-    ASSERT_EQ(ConvTo2022Flush.getError(),
-              std::make_error_code(std::errc::invalid_argument));
-    return;
-  }
-
-  // This should emit an error; there is a malformed multibyte character in the
-  // input string.
-  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, true);
-  EXPECT_TRUE(EC0);
-  std::error_code EC1 = ConvTo2022Flush->flush();
-  EXPECT_TRUE(!EC1);
-  std::error_code EC2 = ConvTo2022Flush->convert(Src1, Dst1, true);
-  EXPECT_TRUE(!EC2);
-  EXPECT_STREQ(EarthKanjiOnlyISO2022, static_cast<std::string>(Dst1).c_str());
-}
-
-TEST(CharSet, ShiftStateIBM939Flush) {
-  StringRef Src0(EarthUTFBroken);
-  StringRef Src1(EarthKanjiOnlyUTF);
-  SmallString<64> Dst0;
-  SmallString<64> Dst1;
-  ErrorOr<CharSetConverter> ConvTo939Flush =
-      CharSetConverter::create("UTF-8", "IBM-939");
-  if (!ConvTo939Flush) {
-    ASSERT_EQ(ConvTo939Flush.getError(),
-              std::make_error_code(std::errc::invalid_argument));
-    return;
-  }
-
-  // This should emit an error; there is a malformed multibyte character in the
-  // input string.
-  std::error_code EC0 = ConvTo939Flush->convert(Src0, Dst0, true);
-  EXPECT_TRUE(EC0);
-  std::error_code EC1 = ConvTo939Flush->flush();
-  EXPECT_TRUE(!EC1);
-  std::error_code EC2 = ConvTo939Flush->convert(Src1, Dst1, true);
-  EXPECT_TRUE(!EC2);
-  EXPECT_STREQ(EarthKanjiOnlyIBM939, static_cast<std::string>(Dst1).c_str());
-}
-
-TEST(CharSet, ShiftState2022Flush1) {
-  StringRef Src0(EarthUTF);
-  SmallString<64> Dst0;
-  SmallString<64> Dst1;
-  ErrorOr<CharSetConverter> ConvTo2022Flush =
-      CharSetConverter::create("UTF-8", "ISO-2022-JP");
-  if (!ConvTo2022Flush) {
-    ASSERT_EQ(ConvTo2022Flush.getError(),
-              std::make_error_code(std::errc::invalid_argument));
-    return;
-  }
-
-  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, false);
-  EXPECT_TRUE(!EC0);
-  EXPECT_STREQ(EarthISO2022ShiftBack, static_cast<std::string>(Dst0).c_str());
-  std::error_code EC1 = ConvTo2022Flush->flush(Dst1);
-  EXPECT_TRUE(!EC1);
-  EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Dst1).c_str());
-}
-
-#endif
-
 } // namespace

>From aaa8a55245d8dd23ecfdb91ae68093cffc9b422e Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 23 Apr 2024 08:09:16 -0400
Subject: [PATCH 12/14] remove other flush function as well

---
 llvm/include/llvm/Support/CharSet.h |  3 ---
 llvm/lib/Support/CharSet.cpp        | 17 -----------------
 2 files changed, 20 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index b3bc138518b1a0..1500ccae0a24b6 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -62,7 +62,6 @@ class CharSetConverterImplBase {
   /// If the destination character set is a stateful character set,
   /// set the shift state to the initial state.
   /// Otherwise this is a no-op.
-  virtual std::error_code flush() const = 0;
 };
 } // namespace details
 
@@ -135,8 +134,6 @@ class CharSetConverter {
       return std::string(Result);
     return EC;
   }
-
-  std::error_code flush() const { return Converter->flush(); }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 73cd34b535b106..52f00b736af451 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -90,7 +90,6 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
 
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
-  std::error_code flush() const override;
 };
 
 std::error_code CharSetConverterTable::convert(StringRef Source,
@@ -106,10 +105,6 @@ std::error_code CharSetConverterTable::convert(StringRef Source,
   return std::error_code();
 }
 
-std::error_code CharSetConverterTable::flush() const {
-  return std::error_code();
-}
-
 #ifdef HAVE_ICU
 class CharSetConverterICU : public details::CharSetConverterImplBase {
   UConverter *FromConvDesc;
@@ -137,7 +132,6 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
 
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
-  std::error_code flush() const override;
 };
 
 std::error_code CharSetConverterICU::convert(StringRef Source,
@@ -180,8 +174,6 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
   return std::error_code();
 }
 
-std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
-
 #elif defined(HAVE_ICONV)
 class CharSetConverterIconv : public details::CharSetConverterImplBase {
   iconv_t ConvDesc;
@@ -191,7 +183,6 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
 
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
-  std::error_code flush() const override;
 };
 
 std::error_code CharSetConverterIconv::convert(StringRef Source,
@@ -243,14 +234,6 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   return std::error_code();
 }
 
-std::error_code CharSetConverterIconv::flush() const {
-  size_t Ret = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
-  if (Ret == static_cast<size_t>(-1)) {
-    return std::error_code(errno, std::generic_category());
-  }
-  return std::error_code();
-}
-
 #endif // HAVE_ICONV
 } // namespace
 

>From 3980bf5818db9833a8a1b062efd2088abcf3a67f Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 24 Apr 2024 10:43:41 -0400
Subject: [PATCH 13/14] update comments

---
 llvm/include/llvm/Support/CharSet.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 1500ccae0a24b6..55d75d25102c1d 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -51,17 +51,12 @@ class CharSetConverterImplBase {
   /// In case of an error, the result string contains the successfully converted
   /// part of the input string.
   ///
+  /// If the destination charset is a stateful character set, the shift state
+  /// will be set to the initial state.
 
   virtual std::error_code convert(StringRef Source,
                                   SmallVectorImpl<char> &Result,
                                   bool ShouldAutoFlush) const = 0;
-
-  /// Restore the conversion to the original state.
-  /// \return error code in case something went wrong
-  ///
-  /// If the destination character set is a stateful character set,
-  /// set the shift state to the initial state.
-  /// Otherwise this is a no-op.
 };
 } // namespace details
 

>From 19f64df438e816ffb565c05bae196cf76a877a17 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 29 Apr 2024 09:52:08 -0400
Subject: [PATCH 14/14] reset iconv if failed, cause overflow in testcase

---
 llvm/lib/Support/CharSet.cpp           | 3 +++
 llvm/unittests/Support/CharSetTest.cpp | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 52f00b736af451..55e0e2f1692346 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -66,6 +66,7 @@ static void HandleOverflow(size_t &Capacity, char *&Output,
   Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
                  ? 2 * Capacity
                  : std::numeric_limits<size_t>::max();
+  Result.resize(0);
   Result.resize_for_overwrite(Capacity);
   Output = static_cast<char *>(Result.data());
   OutputLength = Capacity;
@@ -205,6 +206,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
       // An error occurred. Check if we can gracefully handle it.
       if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
+        // Reset converter
+        iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
         return std::error_code();
       } else {
         // Some other error occurred.
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 4628a44ef7fff2..25f3455753908b 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -156,7 +156,7 @@ TEST(CharSet, RoundTrip) {
 TEST(CharSet, ShiftState2022) {
   // Earth string.
   StringRef Src(EarthUTF);
-  SmallString<64> Dst;
+  SmallString<8> Dst;
 
   ErrorOr<CharSetConverter> ConvTo2022 =
       CharSetConverter::create("UTF-8", "ISO-2022-JP");



More information about the llvm-commits mailing list