[llvm] Create a CharSetConverter class with both iconv and icu support (PR #74516)

Mon Jan 13 07:12:28 PST 2025

https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/74516

>From bbf1cd20b1236fc4d4fffb19925382a2a4f33720 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 5 Dec 2023 15:08:43 -0500
Subject: [PATCH 01/19] Create a CharSetConverter class with both iconv and icu
 support.

---
 llvm/cmake/config-ix.cmake              |  16 +
 llvm/include/llvm/Config/config.h.cmake |   6 +
 llvm/include/llvm/Support/CharSet.h     | 160 ++++++++++
 llvm/lib/Support/CMakeLists.txt         |  17 ++
 llvm/lib/Support/CharSet.cpp            | 370 ++++++++++++++++++++++++
 llvm/unittests/Support/CMakeLists.txt   |   1 +
 llvm/unittests/Support/CharSetTest.cpp  | 281 ++++++++++++++++++
 7 files changed, 851 insertions(+)
 create mode 100644 llvm/include/llvm/Support/CharSet.h
 create mode 100644 llvm/lib/Support/CharSet.cpp
 create mode 100644 llvm/unittests/Support/CharSetTest.cpp

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 64878d28d9e1e5..0504a5b2d742ba 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -272,6 +272,22 @@ if(LLVM_HAS_LOGF128)
   set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
 endif()
 
+# Check for ICU.
+find_package(ICU COMPONENTS uc i18n)
+if(ICU_FOUND)
+  set(HAVE_ICU 1)
+else()
+  set(HAVE_ICU 0)
+endif()
+
+# Check for iconv.
+find_package(Iconv)
+if(Iconv_FOUND)
+  set(HAVE_ICONV 1)
+else()
+  set(HAVE_ICONV 0)
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 3e6b94dfbe5458..77d352fc50e77e 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -279,6 +279,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define if icu library is available */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 00000000000000..856b3be65ff7ed
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,160 @@
+//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <functional>
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+class CharSetConverterImplBase {
+public:
+  virtual ~CharSetConverterImplBase() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  ///
+
+  virtual std::error_code convert(StringRef Source,
+                                  SmallVectorImpl<char> &Result,
+                                  bool ShouldAutoFlush) const = 0;
+
+  /// Restore the conversion to the original state.
+  /// \return error code in case something went wrong
+  ///
+  /// If the original character set or the destination character set
+  /// are multi-byte character sets, set the shift state to the initial
+  /// state. Otherwise this is a no-op.
+  virtual std::error_code flush() const = 0;
+
+  virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+enum class id {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+class CharSetConverter {
+  // details::CharSetConverterImplBase *Converter;
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// \param[in] CSFrom name of the source character encoding
+  /// \param[in] CSTo name of the target character encoding
+  /// \return a CharSetConverter instance
+  static CharSetConverter create(text_encoding::id CSFrom,
+                                 text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  CharSetConverter(CharSetConverter &&Other) {
+    Converter = std::move(Other.Converter);
+  }
+
+  CharSetConverter &operator=(CharSetConverter &&Other) {
+    if (this != &Other)
+      Converter = std::move(Other.Converter);
+    return *this;
+  }
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings.
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return Converter->convert(Source, Result, ShouldAutoFlush);
+  }
+
+  char convert(char SingleChar) const {
+    SmallString<1> Result;
+    Converter->convert(StringRef(&SingleChar, 1), Result, false);
+    return Result[0];
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  std::error_code convert(const std::string &Source,
+                          SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return convert(StringRef(Source), Result, ShouldAutoFlush);
+  }
+
+  std::error_code flush() const { return Converter->flush(); }
+
+  std::error_code flush(SmallVectorImpl<char> &Result) const {
+    return Converter->flush(Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 2ecaea4b02bf61..87543eb66f75b9 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -160,6 +160,7 @@ add_llvm_component_library(LLVMSupport
   CachePruning.cpp
   Caching.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
@@ -310,6 +311,22 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link icu library if it is an external library.
+if(ICU_FOUND)
+  target_link_libraries(LLVMSupport
+  PRIVATE
+  ${ICU_LIBRARIES}
+  )
+else()
+  # Link iconv library if it is an external library.
+  if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+    target_link_libraries(LLVMSupport
+    PRIVATE
+    ${Iconv_LIBRARIES}
+    )
+  endif()
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 00000000000000..dbc2cb7c1839d2
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,370 @@
+//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encoding.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) {
+      Ch = toLower(Ch);
+      if (Ch != '0' || PrevDigit) {
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch);
+      }
+    }
+  }
+}
+
+// Maps the charset name to enum constant if possible.
+std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+#define CSNAME(CS, STR)                                                        \
+  if (Normalized.equals(STR))                                                  \
+  return CS
+  CSNAME(text_encoding::id::UTF8, "utf8");
+  CSNAME(text_encoding::id::IBM1047, "ibm1047");
+#undef CSNAME
+  return std::nullopt;
+}
+
+namespace {
+enum ConversionType {
+  UTFToIBM1047,
+  IBM1047ToUTF,
+};
+
+// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase {
+  ConversionType ConvType;
+
+public:
+  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterTable::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  if (ConvType == IBM1047ToUTF) {
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  } else if (ConvType == UTFToIBM1047) {
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+std::error_code CharSetConverterTable::flush() const {
+  return std::error_code();
+}
+
+std::error_code
+CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
+}
+
+#ifdef HAVE_ICU
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  UConverter *FromConvDesc = nullptr;
+  UConverter *ToConvDesc = nullptr;
+public:
+  CharSetConverterICU(UConverter *Converter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    ToConvDesc = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC))
+      ToConvDesc = nullptr;
+  }
+  CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    FromConvDesc = ucnv_safeClone(FromConverter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC))
+      FromConvDesc = nullptr;
+    ToConvDesc = ucnv_safeClone(ToConverter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC))
+      ToConvDesc = nullptr;
+  }
+  // The clones made above belong to this object; ucnv_close() ignores nullptr.
+  ~CharSetConverterICU() override {
+    ucnv_close(FromConvDesc);
+    ucnv_close(ToConvDesc);
+  }
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterICU::convert(StringRef Source,
+                                             SmallVectorImpl<char> &Result,
+                                             bool ShouldAutoFlush) const {
+  // ucnv_convertEx() rejects null source/target pointers, so an empty input is
+  // answered directly with an empty result.
+  if (Source.empty()) {
+    Result.clear();
+    return std::error_code();
+  }
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  char *Output = nullptr;
+
+  auto HandleError = [&Capacity, &Output, &Result](UErrorCode UEC) {
+    if (UEC == U_BUFFER_OVERFLOW_ERROR &&
+        Capacity < std::numeric_limits<size_t>::max()) {
+      // No space left in output buffer. Double the size of the underlying
+      // memory in the SmallVectorImpl (at least 16 bytes, so that an empty
+      // buffer can grow too); the conversion is then restarted from scratch.
+      Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                     ? std::max<size_t>(2 * Capacity, 16)
+                     : std::numeric_limits<size_t>::max();
+      return std::error_code();
+    }
+    // Some other error occurred. ICU reports failures only through UErrorCode
+    // and never sets errno, so report illegal_byte_sequence as documented for
+    // convert() and keep just the successfully converted prefix in Result.
+    Result.resize(Output - Result.data());
+    return std::make_error_code(std::errc::illegal_byte_sequence);
+  };
+
+  UErrorCode EC;
+  do {
+    EC = U_ZERO_ERROR;
+    const char *Input = Source.data();
+    // Expose the whole capacity as writable output and start at its beginning.
+    Result.resize_for_overwrite(Capacity);
+    Output = Result.data();
+    ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Result.end(), &Input,
+                   Source.end(), /*pivotStart=*/NULL, /*pivotSource=*/NULL,
+                   /*pivotTarget=*/NULL, /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/true, &EC);
+    if (U_FAILURE(EC)) {
+      if (auto Error = HandleError(EC))
+        return Error;
+    }
+  } while (U_FAILURE(EC));
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
+
+std::error_code
+CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
+}
+
+#elif defined(HAVE_ICONV)
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  iconv_t ConvDesc; // Owned: opened by the creator, closed in the destructor.
+public:
+  CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+  ~CharSetConverterIconv() override { iconv_close(ConvDesc); }
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterIconv::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  // Setup the input. Use nullptr to reset iconv state if input length is zero.
+  size_t InputLength = Source.size();
+  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  size_t OutputLength = Capacity;
+
+  size_t Ret;
+
+  // Handle errors returned from iconv().
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1) && errno == E2BIG &&
+        Capacity < std::numeric_limits<size_t>::max()) {
+      // No space left in output buffer. Double the size of the underlying
+      // memory in the SmallVectorImpl (at least 16 bytes, so an empty buffer
+      // grows too), adjust pointer and length and continue the conversion.
+      const size_t Used = Capacity - OutputLength;
+      Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                     ? std::max<size_t>(2 * Capacity, 16)
+                     : std::numeric_limits<size_t>::max();
+      Result.resize_for_overwrite(Capacity);
+      Output = static_cast<char *>(Result.data()) + Used;
+      OutputLength = Capacity - Used;
+      return std::error_code();
+    }
+    // Some other error occurred, or iconv() returned a positive value, which
+    // indicates that some characters were converted in a nonreversible way,
+    // that is, replaced with a SUB symbol. Returning an error in the latter
+    // case makes sure that both conversion routines behave in the same way.
+    const std::error_code EC =
+        Ret == static_cast<size_t>(-1)
+            ? std::error_code(errno, std::generic_category())
+            : std::make_error_code(std::errc::illegal_byte_sequence);
+    // Leave only the successfully converted part in the result.
+    Result.resize(Capacity - OutputLength);
+    return EC;
+  };
+
+  // Convert the string.
+  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+  if (ShouldAutoFlush) {
+    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+      if (auto EC = HandleError(Ret))
+        return EC;
+  }
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+std::error_code CharSetConverterIconv::flush() const {
+  size_t Ret = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+  if (Ret == static_cast<size_t>(-1)) {
+    return std::error_code(errno, std::generic_category());
+  }
+  return std::error_code();
+}
+
+std::error_code
+CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
+  char *Output = Result.data();
+  size_t OutputLength = Result.capacity();
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+
+  // Handle errors returned from iconv().
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // An error occurred. Check if we can gracefully handle it.
+      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+        // No space left in output buffer. Increase the size of the underlying
+        // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
+        // and continue the conversion.
+        const size_t Used = Capacity - OutputLength;
+        Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
+                       ? 2 + Capacity
+                       : std::numeric_limits<size_t>::max();
+        Result.resize_for_overwrite(Capacity);
+        Output = static_cast<char *>(Result.data()) + Used;
+        OutputLength = Capacity - Used;
+        return std::error_code();
+      } else {
+        // Some other error occurred.
+        return std::error_code(errno, std::generic_category());
+      }
+    } else {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    }
+  };
+
+  size_t Ret;
+  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
+                                          text_encoding::id CPTo) {
+
+  assert(CPFrom != CPTo && "Text encodings should be distinct");
+
+  ConversionType Conversion;
+  if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
+    Conversion = UTFToIBM1047;
+  else
+    Conversion = IBM1047ToUTF;
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterTable>(Conversion);
+
+  return CharSetConverter(std::move(Converter));
+}
+
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {
+  std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
+  std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+  // The table-based converter only handles two distinct built-in encodings.
+  if (From && To && *From != *To)
+    return create(*From, *To);
+#ifdef HAVE_ICU
+  // ICU does not set errno; a failed EC makes later ICU calls return early.
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverter *FromConvDesc = ucnv_open(CSFrom.str().c_str(), &EC);
+  UConverter *ToConvDesc = ucnv_open(CSTo.str().c_str(), &EC);
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+  if (U_SUCCESS(EC))
+    Converter = std::make_unique<CharSetConverterICU>(FromConvDesc, ToConvDesc);
+  // CharSetConverterICU works on clones, so the originals are released here.
+  ucnv_close(FromConvDesc);
+  ucnv_close(ToConvDesc);
+  if (!Converter)
+    return std::make_error_code(std::errc::invalid_argument);
+  return CharSetConverter(std::move(Converter));
+#elif defined(HAVE_ICONV)
+  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  return CharSetConverter(std::make_unique<CharSetConverterIconv>(ConvDesc));
+#endif
+  return std::make_error_code(std::errc::invalid_argument);
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 6de81658264420..2fc70604109a19 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_unittest(SupportTests
   BalancedPartitioningTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
+  CharSetTest.cpp
   CrashRecoveryTest.cpp
   Casting.cpp
   CheckedArithmeticTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
new file mode 100644
index 00000000000000..2f2d8f97102b98
--- /dev/null
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,281 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!"
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+// The same string encoded in ISO-2022-JP and in IBM-939, each ending with the
+// sequence that shifts back to the initial state.
+static const char EarthISO2022[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+    "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::UTF8,
+                                                   text_encoding::id::IBM1047);
+  std::error_code EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrA;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentUTF;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Src = CyrillicUTF;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(CharSet, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::IBM1047,
+                                                   text_encoding::id::UTF8);
+  std::error_code EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // ABC string.
+  Src = ABCStrE;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+  Dst.clear();
+
+  // Accent string.
+  Src = AccentE;
+  EC = Conv.convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, RoundTrip) {
+  ErrorOr<CharSetConverter> ConvToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Setup source string.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str, true);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str, true);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+TEST(CharSet, ShiftState2022) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvTo2022->convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939) {
+  // Earth string.
+  StringRef Src(EarthUTF);
+  SmallString<64> Dst;
+
+  ErrorOr<CharSetConverter> ConvToIBM939 =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToIBM939) {
+    ASSERT_EQ(ConvToIBM939.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Check that the string is properly converted.
+  std::error_code EC = ConvToIBM939->convert(Src, Dst, true);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+#if not defined(HAVE_ICU) && defined(HAVE_ICONV)
+
+// Identical to EarthUTF, except the final character (球) has its last byte
+// taken away from it.
+static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
+static const char EarthISO2022ShiftBack[] =
+    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
+static const char ShiftBackOnly[] = "\x1B\x28\x42";
+
+// String "地球".
+static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthKanjiOnlyISO2022[] =
+    "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
+static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, ShiftState2022Flush) {
+  StringRef Src0(EarthUTFBroken);
+  StringRef Src1(EarthKanjiOnlyUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo2022Flush =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!ConvTo2022Flush) {
+    ASSERT_EQ(ConvTo2022Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // This should emit an error; there is a malformed multibyte character in the
+  // input string.
+  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, true);
+  EXPECT_TRUE(EC0);
+  std::error_code EC1 = ConvTo2022Flush->flush();
+  EXPECT_TRUE(!EC1);
+  std::error_code EC2 = ConvTo2022Flush->convert(Src1, Dst1, true);
+  EXPECT_TRUE(!EC2);
+  EXPECT_STREQ(EarthKanjiOnlyISO2022, static_cast<std::string>(Dst1).c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939Flush) {
+  StringRef Src0(EarthUTFBroken);
+  StringRef Src1(EarthKanjiOnlyUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo939Flush =
+      CharSetConverter::create("UTF-8", "IBM-939");
+  if (!ConvTo939Flush) {
+    ASSERT_EQ(ConvTo939Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // This should emit an error; there is a malformed multibyte character in the
+  // input string.
+  std::error_code EC0 = ConvTo939Flush->convert(Src0, Dst0, true);
+  EXPECT_TRUE(EC0);
+  std::error_code EC1 = ConvTo939Flush->flush();
+  EXPECT_TRUE(!EC1);
+  std::error_code EC2 = ConvTo939Flush->convert(Src1, Dst1, true);
+  EXPECT_TRUE(!EC2);
+  EXPECT_STREQ(EarthKanjiOnlyIBM939, static_cast<std::string>(Dst1).c_str());
+}
+
+TEST(CharSet, ShiftState2022Flush1) {
+  StringRef Src0(EarthUTF);
+  SmallString<64> Dst0;
+  SmallString<64> Dst1;
+  ErrorOr<CharSetConverter> ConvTo2022Flush =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!ConvTo2022Flush) {
+    ASSERT_EQ(ConvTo2022Flush.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, false);
+  EXPECT_TRUE(!EC0);
+  EXPECT_STREQ(EarthISO2022ShiftBack, static_cast<std::string>(Dst0).c_str());
+  std::error_code EC1 = ConvTo2022Flush->flush(Dst1);
+  EXPECT_TRUE(!EC1);
+  EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Dst1).c_str());
+}
+
+#endif
+
+} // namespace

>From 1dbd35bb1893640b092a5fc16104537bf468d291 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 9 Jan 2024 14:47:24 -0500
Subject: [PATCH 02/19] address review comments

---
 llvm/include/llvm/Support/CharSet.h |  38 ++++-----
 llvm/lib/Support/CharSet.cpp        | 116 ++++++++--------------------
 2 files changed, 49 insertions(+), 105 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 856b3be65ff7ed..fd077191c235b5 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -1,4 +1,4 @@
-//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -35,9 +35,9 @@ class CharSetConverterImplBase {
 
   /// Converts a string.
   /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
+  /// \param[out] Result container for converted string
   /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings iff true.
+  /// for stateful encodings if true.
   /// \return error code in case something went wrong
   ///
   /// The following error codes can occur, among others:
@@ -59,9 +59,9 @@ class CharSetConverterImplBase {
   /// Restore the conversion to the original state.
   /// \return error code in case something went wrong
   ///
-  /// If the original character set or the destination character set
-  /// are multi-byte character sets, set the shift state to the initial
-  /// state. Otherwise this is a no-op.
+  /// If the destination character set is a stateful character set,
+  /// set the shift state to the initial state.
+  /// Otherwise this is a no-op.
   virtual std::error_code flush() const = 0;
 
   virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
@@ -80,7 +80,6 @@ enum class id {
 } // end namespace text_encoding
 
 /// Utility class to convert between different character set encodings.
-/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
 class CharSetConverter {
   // details::CharSetConverterImplBase *Converter;
   std::unique_ptr<details::CharSetConverterImplBase> Converter;
@@ -121,33 +120,30 @@ class CharSetConverter {
 
   /// Converts a string.
   /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
+  /// \param[out] Result container for converted string
   /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings.
+  /// for stateful encodings.
   /// \return error code in case something went wrong
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush = true) const {
     return Converter->convert(Source, Result, ShouldAutoFlush);
   }
 
+  ErrorOr<std::string> convert(StringRef Source,
+                               bool ShouldAutoFlush = true) const {
+    SmallString<1> Result;
+    auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
+    if (!EC)
+      return std::string(Result);
+    return EC;
+  }
+
   char convert(char SingleChar) const {
     SmallString<1> Result;
     Converter->convert(StringRef(&SingleChar, 1), Result, false);
     return Result[0];
   }
 
-  /// Converts a string.
-  /// \param[in] Source source string
-  /// \param[in,out] Result container for converted string
-  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for multi-byte encodings iff true.
-  /// \return error code in case something went wrong
-  std::error_code convert(const std::string &Source,
-                          SmallVectorImpl<char> &Result,
-                          bool ShouldAutoFlush = true) const {
-    return convert(StringRef(Source), Result, ShouldAutoFlush);
-  }
-
   std::error_code flush() const { return Converter->flush(); }
 
   std::error_code flush(SmallVectorImpl<char> &Result) const {
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index dbc2cb7c1839d2..1a49d665fdbda0 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -1,4 +1,4 @@
-//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//===-- CharSet.cpp - Character set conversion class --------------*- C++ -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -32,7 +32,8 @@ using namespace llvm;
 
 // Normalize the charset name with the charset alias matching algorithm proposed
 // in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
-void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
   bool PrevDigit = false;
   for (auto Ch : CSName) {
     if (isAlnum(Ch)) {
@@ -49,15 +50,26 @@ void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
 std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   SmallString<16> Normalized;
   normalizeCharSetName(CSName, Normalized);
-#define CSNAME(CS, STR)                                                        \
-  if (Normalized.equals(STR))                                                  \
-  return CS
-  CSNAME(text_encoding::id::UTF8, "utf8");
-  CSNAME(text_encoding::id::IBM1047, "ibm1047");
-#undef CSNAME
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
   return std::nullopt;
 }
 
+void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+                    SmallVectorImpl<char> &Result) {
+  // No space left in output buffer. Double the size of the underlying
+  // memory in the SmallVectorImpl, adjust pointer and length and continue
+  // the conversion.
+  Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                 ? 2 * Capacity
+                 : std::numeric_limits<size_t>::max();
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
 namespace {
 enum ConversionType {
   UTFToIBM1047,
@@ -138,31 +150,12 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                                              SmallVectorImpl<char> &Result,
                                              bool ShouldAutoFlush) const {
   // Setup the output. We directly write into the SmallVector.
+  Result.resize_for_overwrite(Source.size());
   size_t OutputLength, Capacity = Result.capacity();
   char *Output, *Out;
 
   UErrorCode EC = U_ZERO_ERROR;
 
-  auto HandleError = [&Capacity, &Output, &OutputLength,
-                      &Result](UErrorCode UEC) {
-    if (UEC == U_BUFFER_OVERFLOW_ERROR &&
-        Capacity < std::numeric_limits<size_t>::max()) {
-      // No space left in output buffer. Double the size of the underlying
-      // memory in the SmallVectorImpl, adjust pointer and length and continue
-      // the conversion.
-      Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
-                     ? 2 * Capacity
-                     : std::numeric_limits<size_t>::max();
-      Result.resize_for_overwrite(Capacity);
-      Output = static_cast<char *>(Result.data());
-      OutputLength = Capacity;
-      return std::error_code();
-    } else {
-      // Some other error occured.
-      return std::error_code(errno, std::generic_category());
-    }
-  };
-
   do {
     EC = U_ZERO_ERROR;
     size_t InputLength = Source.size();
@@ -176,10 +169,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
     ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
-                   /*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC);
+                   /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/ShouldAutoFlush, &EC);
     if (U_FAILURE(EC)) {
-      if (auto error = HandleError(EC))
-        return error;
+      if (EC == U_BUFFER_OVERFLOW_ERROR &&
+          Capacity < std::numeric_limits<size_t>::max())
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+      else
+        // Some other error occured.
+        return std::error_code(errno, std::generic_category());
     } else if (U_SUCCESS(EC))
       break;
   } while (U_FAILURE(EC));
@@ -215,8 +213,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   size_t InputLength = Source.size();
   char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
   // Setup the output. We directly write into the SmallVector.
+  Result.resize_for_overwrite(Source.size());
   size_t Capacity = Result.capacity();
-  Result.resize_for_overwrite(Capacity);
   char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
   size_t OutputLength = Capacity;
 
@@ -227,16 +225,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
     if (Ret == static_cast<size_t>(-1)) {
       // An error occured. Check if we can gracefully handle it.
       if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
-        // No space left in output buffer. Double the size of the underlying
-        // memory in the SmallVectorImpl, adjust pointer and length and continue
-        // the conversion.
-        const size_t Used = Capacity - OutputLength;
-        Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
-                       ? 2 * Capacity
-                       : std::numeric_limits<size_t>::max();
-        Result.resize_for_overwrite(Capacity);
-        Output = static_cast<char *>(Result.data()) + Used;
-        OutputLength = Capacity - Used;
+        HandleOverflow(Capacity, Output, OutputLength, Result);
         return std::error_code();
       } else {
         // Some other error occured.
@@ -276,48 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
 
 std::error_code
 CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
-  char *Output = Result.data();
-  size_t OutputLength = Result.capacity();
-  size_t Capacity = Result.capacity();
-  Result.resize_for_overwrite(Capacity);
-
-  // Handle errors returned from iconv().
-  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
-    if (Ret == static_cast<size_t>(-1)) {
-      // An error occured. Check if we can gracefully handle it.
-      if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
-        // No space left in output buffer. Increase the size of the underlying
-        // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
-        // and continue the conversion.
-        const size_t Used = Capacity - OutputLength;
-        Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
-                       ? 2 + Capacity
-                       : std::numeric_limits<size_t>::max();
-        Result.resize_for_overwrite(Capacity);
-        Output = static_cast<char *>(Result.data()) + Used;
-        OutputLength = Capacity - Used;
-        return std::error_code();
-      } else {
-        // Some other error occured.
-        return std::error_code(errno, std::generic_category());
-      }
-    } else {
-      // A positive return value indicates that some characters were converted
-      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
-      // an error in this case makes sure that both conversion routines behave
-      // in the same way.
-      return std::make_error_code(std::errc::illegal_byte_sequence);
-    }
-  };
-
-  size_t Ret;
-  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
-    if (auto EC = HandleError(Ret))
-      return EC;
-
-  // Re-adjust size to actual size.
-  Result.resize(Capacity - OutputLength);
-  return std::error_code();
+  return convert(nullptr, Result);
 }
 
 #endif // HAVE_ICONV

>From c864288e945da4d31c3387f2efae25d84b8f4d41 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 10 Jan 2024 09:52:43 -0500
Subject: [PATCH 03/19] add LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV option

---
 llvm/CMakeLists.txt        |  4 ++++
 llvm/cmake/config-ix.cmake | 24 ++++++++++++++----------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index f14065ab037990..118817f19ff2fa 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -560,6 +560,10 @@ else()
   option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 endif()
 
+set(LLVM_ENABLE_ICU "ON" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "ON" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 0504a5b2d742ba..3d0eecb65e7089 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -273,19 +273,23 @@ if(LLVM_HAS_LOGF128)
 endif()
 
 #Check for icu.
-find_package(ICU COMPONENTS uc i18n)
-if(ICU_FOUND)
-  set(HAVE_ICU 1)
-else()
-  set(HAVE_ICU 0)
+if(LLVM_ENABLE_ICU)
+  find_package(ICU COMPONENTS uc i18n)
+  if(ICU_FOUND)
+    set(HAVE_ICU 1)
+  else()
+    set(HAVE_ICU 0)
+  endif()
 endif()
 
 # Check for iconv.
-find_package(Iconv)
-if(Iconv_FOUND)
-  set(HAVE_ICONV 1)
-else()
-  set(HAVE_ICONV 0)
+if(LLVM_ENABLE_ICONV)
+  find_package(Iconv)
+  if(Iconv_FOUND)
+    set(HAVE_ICONV 1)
+  else()
+    set(HAVE_ICONV 0)
+  endif()
 endif()
 
 # function checks

>From 710f7450fded614455ff2d7ed16f40c89119b343 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 31 Jan 2024 14:06:50 -0500
Subject: [PATCH 04/19] remove single char conversion function

---
 llvm/include/llvm/Support/CharSet.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index fd077191c235b5..e573b3da9d7cc3 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -138,12 +138,6 @@ class CharSetConverter {
     return EC;
   }
 
-  char convert(char SingleChar) const {
-    SmallString<1> Result;
-    Converter->convert(StringRef(&SingleChar, 1), Result, false);
-    return Result[0];
-  }
-
   std::error_code flush() const { return Converter->flush(); }
 
   std::error_code flush(SmallVectorImpl<char> &Result) const {

>From df4ee4370cd2831caf6688c94a01b0c2f25cd4c1 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 23 Feb 2024 13:35:34 -0500
Subject: [PATCH 05/19] handle FORCE_ON, look for shared libraries only for ICU

---
 llvm/cmake/config-ix.cmake | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 3d0eecb65e7089..3366cdd733d700 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -274,22 +274,31 @@ endif()
 
 #Check for icu.
 if(LLVM_ENABLE_ICU)
-  find_package(ICU COMPONENTS uc i18n)
-  if(ICU_FOUND)
-    set(HAVE_ICU 1)
+  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ".so")
+  if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+    find_package(ICU REQUIRED COMPONENTS uc i18n)
+    if (NOT ICU_FOUND)
+      message(FATAL_ERROR "Failed to configure icu, but LLVM_ENABLE_ICU is FORCE_ON")
+    endif()
   else()
-    set(HAVE_ICU 0)
+    find_package(ICU COMPONENTS uc i18n)
   endif()
+  set(HAVE_ICU ${ICU_FOUND})
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
 endif()
 
 # Check for iconv.
 if(LLVM_ENABLE_ICONV)
-  find_package(Iconv)
-  if(Iconv_FOUND)
-    set(HAVE_ICONV 1)
+  if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+    find_package(Iconv REQUIRED)
+    if (NOT Iconv_FOUND)
+      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+    endif()
   else()
-    set(HAVE_ICONV 0)
+    find_package(Iconv)
   endif()
+  set(HAVE_ICONV ${Iconv_FOUND})
 endif()
 
 # function checks

>From 4bbf9b0d87ffb48fe955f03bf9e4aba68fcfc153 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 4 Apr 2024 13:45:13 -0400
Subject: [PATCH 06/19] only allow builtin iconv support

---
 llvm/cmake/config-ix.cmake      | 4 +++-
 llvm/lib/Support/CMakeLists.txt | 8 --------
 llvm/lib/Support/CharSet.cpp    | 4 ++--
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 3366cdd733d700..eee54d92b78f4e 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -298,7 +298,9 @@ if(LLVM_ENABLE_ICONV)
   else()
     find_package(Iconv)
   endif()
-  set(HAVE_ICONV ${Iconv_FOUND})
+  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+    set(HAVE_ICONV 1)
+  endif()
 endif()
 
 # function checks
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 87543eb66f75b9..ce506b9b226da3 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -317,14 +317,6 @@ if(ICU_FOUND)
   PRIVATE
   ${ICU_LIBRARIES}
   )
-else()
-  # Link iconv library if it is an external library.
-  if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
-    target_link_libraries(LLVMSupport
-    PRIVATE
-    ${Iconv_LIBRARIES}
-    )
-  endif()
 endif()
 
 set(llvm_system_libs ${system_libs})
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 1a49d665fdbda0..330f420c452232 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -215,7 +215,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   // Setup the output. We directly write into the SmallVector.
   Result.resize_for_overwrite(Source.size());
   size_t Capacity = Result.capacity();
-  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  char *Output = static_cast<char *>(Result.data());
   size_t OutputLength = Capacity;
 
   size_t Ret;
@@ -265,7 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
 
 std::error_code
 CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
-  return convert(nullptr, Result);
+  return convert("", Result, /*ShouldAutoFlush=*/true);
 }
 
 #endif // HAVE_ICONV

>From 0501f35f7ff30523cb187147e4aee4f2d7cfbe77 Mon Sep 17 00:00:00 2001
From: Abhina Sree <69635948+abhina-sree at users.noreply.github.com>
Date: Wed, 17 Apr 2024 08:33:03 -0400
Subject: [PATCH 07/19] Update llvm/cmake/config-ix.cmake

Co-authored-by: Eli Friedman <efriedma at quicinc.com>
---
 llvm/cmake/config-ix.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index eee54d92b78f4e..492941bf32021f 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -292,7 +292,7 @@ endif()
 if(LLVM_ENABLE_ICONV)
   if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
     find_package(Iconv REQUIRED)
-    if (NOT Iconv_FOUND)
+    if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
       message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
     endif()
   else()

>From 81db614057665a199ab88bc49ffd1b5c6b4851bf Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 19 Apr 2024 11:55:35 -0400
Subject: [PATCH 08/19] address comments

---
 llvm/include/llvm/Support/CharSet.h |  2 +-
 llvm/lib/Support/CharSet.cpp        | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index e573b3da9d7cc3..b2c50cd423d6da 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -131,7 +131,7 @@ class CharSetConverter {
 
   ErrorOr<std::string> convert(StringRef Source,
                                bool ShouldAutoFlush = true) const {
-    SmallString<1> Result;
+    SmallString<100> Result;
     auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
     if (!EC)
       return std::string(Result);
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 330f420c452232..55eb5f3f1ad6d6 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -47,7 +47,7 @@ static void normalizeCharSetName(StringRef CSName,
 }
 
 // Maps the charset name to enum constant if possible.
-std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   SmallString<16> Normalized;
   normalizeCharSetName(CSName, Normalized);
   if (Normalized.equals("utf8"))
@@ -57,8 +57,9 @@ std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   return std::nullopt;
 }
 
-void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
-                    SmallVectorImpl<char> &Result) {
+static void HandleOverflow(size_t &Capacity, char *&Output,
+                           size_t &OutputLength,
+                           SmallVectorImpl<char> &Result) {
   // No space left in output buffer. Double the size of the underlying
   // memory in the SmallVectorImpl, adjust pointer and length and continue
   // the conversion.
@@ -150,9 +151,10 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                                              SmallVectorImpl<char> &Result,
                                              bool ShouldAutoFlush) const {
   // Setup the output. We directly write into the SmallVector.
-  Result.resize_for_overwrite(Source.size());
-  size_t OutputLength, Capacity = Result.capacity();
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
   char *Output, *Out;
+  Result.resize_for_overwrite(Capacity);
 
   UErrorCode EC = U_ZERO_ERROR;
 
@@ -163,9 +165,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
         InputLength ? const_cast<char *>(Source.data()) : nullptr;
     const char *In = Input;
     Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
-    OutputLength = Capacity;
     Out = Output;
-    Result.resize_for_overwrite(Capacity);
     ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
@@ -177,7 +177,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
         HandleOverflow(Capacity, Output, OutputLength, Result);
       else
         // Some other error occured.
-        return std::error_code(errno, std::generic_category());
+        return std::error_code(EILSEQ, std::generic_category());
     } else if (U_SUCCESS(EC))
       break;
   } while (U_FAILURE(EC));

>From 6926d09ed05aae337a3a6139a4b29df162af13a6 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 22 Apr 2024 13:27:47 -0400
Subject: [PATCH 09/19] remove function to get shift back characters, address
 comments

---
 llvm/include/llvm/Support/CharSet.h    |  6 --
 llvm/lib/Support/CharSet.cpp           | 31 +++-------
 llvm/unittests/Support/CharSetTest.cpp | 85 --------------------------
 3 files changed, 7 insertions(+), 115 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index b2c50cd423d6da..b3bc138518b1a0 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -63,8 +63,6 @@ class CharSetConverterImplBase {
   /// set the shift state to the initial state.
   /// Otherwise this is a no-op.
   virtual std::error_code flush() const = 0;
-
-  virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
 };
 } // namespace details
 
@@ -139,10 +137,6 @@ class CharSetConverter {
   }
 
   std::error_code flush() const { return Converter->flush(); }
-
-  std::error_code flush(SmallVectorImpl<char> &Result) const {
-    return Converter->flush(Result);
-  }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 55eb5f3f1ad6d6..73cd34b535b106 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -91,7 +91,6 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
   std::error_code flush() const override;
-  std::error_code flush(SmallVectorImpl<char> &Result) const override;
 };
 
 std::error_code CharSetConverterTable::convert(StringRef Source,
@@ -111,11 +110,6 @@ std::error_code CharSetConverterTable::flush() const {
   return std::error_code();
 }
 
-std::error_code
-CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
-  return std::error_code();
-}
-
 #ifdef HAVE_ICU
 class CharSetConverterICU : public details::CharSetConverterImplBase {
   UConverter *FromConvDesc;
@@ -144,7 +138,6 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
   std::error_code flush() const override;
-  std::error_code flush(SmallVectorImpl<char> &Result) const override;
 };
 
 std::error_code CharSetConverterICU::convert(StringRef Source,
@@ -164,7 +157,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
     const char *Input =
         InputLength ? const_cast<char *>(Source.data()) : nullptr;
     const char *In = Input;
-    Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+    Output = static_cast<char *>(Result.data());
     Out = Output;
     ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
@@ -173,14 +166,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                    /*flush=*/ShouldAutoFlush, &EC);
     if (U_FAILURE(EC)) {
       if (EC == U_BUFFER_OVERFLOW_ERROR &&
-          Capacity < std::numeric_limits<size_t>::max())
+          Capacity < std::numeric_limits<size_t>::max()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
-      else
+        continue;
+      } else
         // Some other error occured.
         return std::error_code(EILSEQ, std::generic_category());
-    } else if (U_SUCCESS(EC))
-      break;
-  } while (U_FAILURE(EC));
+    }
+    break;
+  } while (true);
 
   Result.resize(Output - Out);
   return std::error_code();
@@ -188,11 +182,6 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
 
 std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
 
-std::error_code
-CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
-  return std::error_code();
-}
-
 #elif defined(HAVE_ICONV)
 class CharSetConverterIconv : public details::CharSetConverterImplBase {
   iconv_t ConvDesc;
@@ -203,7 +192,6 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
   std::error_code flush() const override;
-  std::error_code flush(SmallVectorImpl<char> &Result) const override;
 };
 
 std::error_code CharSetConverterIconv::convert(StringRef Source,
@@ -263,11 +251,6 @@ std::error_code CharSetConverterIconv::flush() const {
   return std::error_code();
 }
 
-std::error_code
-CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
-  return convert("", Result, /*ShouldAutoFlush=*/true);
-}
-
 #endif // HAVE_ICONV
 } // namespace
 
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 2f2d8f97102b98..4628a44ef7fff2 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -193,89 +193,4 @@ TEST(CharSet, ShiftStateIBM939) {
   EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
 }
 
-#if not defined(HAVE_ICU) && defined(HAVE_ICONV)
-
-// Identical to EarthUTF, except the final character (球) has its last byte
-// taken away from it.
-static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
-static const char EarthISO2022ShiftBack[] =
-    "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
-static const char ShiftBackOnly[] = "\x1B\x28\x42";
-
-// String "地球".
-static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
-static const char EarthKanjiOnlyISO2022[] =
-    "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
-static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
-
-TEST(CharSet, ShiftState2022Flush) {
-  StringRef Src0(EarthUTFBroken);
-  StringRef Src1(EarthKanjiOnlyUTF);
-  SmallString<64> Dst0;
-  SmallString<64> Dst1;
-  ErrorOr<CharSetConverter> ConvTo2022Flush =
-      CharSetConverter::create("UTF-8", "ISO-2022-JP");
-  if (!ConvTo2022Flush) {
-    ASSERT_EQ(ConvTo2022Flush.getError(),
-              std::make_error_code(std::errc::invalid_argument));
-    return;
-  }
-
-  // This should emit an error; there is a malformed multibyte character in the
-  // input string.
-  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, true);
-  EXPECT_TRUE(EC0);
-  std::error_code EC1 = ConvTo2022Flush->flush();
-  EXPECT_TRUE(!EC1);
-  std::error_code EC2 = ConvTo2022Flush->convert(Src1, Dst1, true);
-  EXPECT_TRUE(!EC2);
-  EXPECT_STREQ(EarthKanjiOnlyISO2022, static_cast<std::string>(Dst1).c_str());
-}
-
-TEST(CharSet, ShiftStateIBM939Flush) {
-  StringRef Src0(EarthUTFBroken);
-  StringRef Src1(EarthKanjiOnlyUTF);
-  SmallString<64> Dst0;
-  SmallString<64> Dst1;
-  ErrorOr<CharSetConverter> ConvTo939Flush =
-      CharSetConverter::create("UTF-8", "IBM-939");
-  if (!ConvTo939Flush) {
-    ASSERT_EQ(ConvTo939Flush.getError(),
-              std::make_error_code(std::errc::invalid_argument));
-    return;
-  }
-
-  // This should emit an error; there is a malformed multibyte character in the
-  // input string.
-  std::error_code EC0 = ConvTo939Flush->convert(Src0, Dst0, true);
-  EXPECT_TRUE(EC0);
-  std::error_code EC1 = ConvTo939Flush->flush();
-  EXPECT_TRUE(!EC1);
-  std::error_code EC2 = ConvTo939Flush->convert(Src1, Dst1, true);
-  EXPECT_TRUE(!EC2);
-  EXPECT_STREQ(EarthKanjiOnlyIBM939, static_cast<std::string>(Dst1).c_str());
-}
-
-TEST(CharSet, ShiftState2022Flush1) {
-  StringRef Src0(EarthUTF);
-  SmallString<64> Dst0;
-  SmallString<64> Dst1;
-  ErrorOr<CharSetConverter> ConvTo2022Flush =
-      CharSetConverter::create("UTF-8", "ISO-2022-JP");
-  if (!ConvTo2022Flush) {
-    ASSERT_EQ(ConvTo2022Flush.getError(),
-              std::make_error_code(std::errc::invalid_argument));
-    return;
-  }
-
-  std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, false);
-  EXPECT_TRUE(!EC0);
-  EXPECT_STREQ(EarthISO2022ShiftBack, static_cast<std::string>(Dst0).c_str());
-  std::error_code EC1 = ConvTo2022Flush->flush(Dst1);
-  EXPECT_TRUE(!EC1);
-  EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Dst1).c_str());
-}
-
-#endif
-
 } // namespace

>From 6f558d9dee6a87100111b782c761743ebd178ab0 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 23 Apr 2024 08:09:16 -0400
Subject: [PATCH 10/19] remove other flush function as well

---
 llvm/include/llvm/Support/CharSet.h |  3 ---
 llvm/lib/Support/CharSet.cpp        | 17 -----------------
 2 files changed, 20 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index b3bc138518b1a0..1500ccae0a24b6 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -62,7 +62,6 @@ class CharSetConverterImplBase {
   /// If the destination character set is a stateful character set,
   /// set the shift state to the initial state.
   /// Otherwise this is a no-op.
-  virtual std::error_code flush() const = 0;
 };
 } // namespace details
 
@@ -135,8 +134,6 @@ class CharSetConverter {
       return std::string(Result);
     return EC;
   }
-
-  std::error_code flush() const { return Converter->flush(); }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 73cd34b535b106..52f00b736af451 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -90,7 +90,6 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
 
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
-  std::error_code flush() const override;
 };
 
 std::error_code CharSetConverterTable::convert(StringRef Source,
@@ -106,10 +105,6 @@ std::error_code CharSetConverterTable::convert(StringRef Source,
   return std::error_code();
 }
 
-std::error_code CharSetConverterTable::flush() const {
-  return std::error_code();
-}
-
 #ifdef HAVE_ICU
 class CharSetConverterICU : public details::CharSetConverterImplBase {
   UConverter *FromConvDesc;
@@ -137,7 +132,6 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
 
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
-  std::error_code flush() const override;
 };
 
 std::error_code CharSetConverterICU::convert(StringRef Source,
@@ -180,8 +174,6 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
   return std::error_code();
 }
 
-std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
-
 #elif defined(HAVE_ICONV)
 class CharSetConverterIconv : public details::CharSetConverterImplBase {
   iconv_t ConvDesc;
@@ -191,7 +183,6 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
 
   std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
                           bool ShouldAutoFlush) const override;
-  std::error_code flush() const override;
 };
 
 std::error_code CharSetConverterIconv::convert(StringRef Source,
@@ -243,14 +234,6 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   return std::error_code();
 }
 
-std::error_code CharSetConverterIconv::flush() const {
-  size_t Ret = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
-  if (Ret == static_cast<size_t>(-1)) {
-    return std::error_code(errno, std::generic_category());
-  }
-  return std::error_code();
-}
-
 #endif // HAVE_ICONV
 } // namespace
 

>From 53be2d6050f1492e2364969760348fdb4869ea71 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 24 Apr 2024 10:43:41 -0400
Subject: [PATCH 11/19] update comments

---
 llvm/include/llvm/Support/CharSet.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 1500ccae0a24b6..55d75d25102c1d 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -51,17 +51,12 @@ class CharSetConverterImplBase {
   /// In case of an error, the result string contains the successfully converted
   /// part of the input string.
   ///
+  /// If the destination charset is a stateful character set, the shift state
+  /// will be set to the initial state.
 
   virtual std::error_code convert(StringRef Source,
                                   SmallVectorImpl<char> &Result,
                                   bool ShouldAutoFlush) const = 0;
-
-  /// Restore the conversion to the original state.
-  /// \return error code in case something went wrong
-  ///
-  /// If the destination character set is a stateful character set,
-  /// set the shift state to the initial state.
-  /// Otherwise this is a no-op.
 };
 } // namespace details
 

>From b99dca555e75a370fe0f1b798906b9307e075c9c Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 29 Apr 2024 09:52:08 -0400
Subject: [PATCH 12/19] reset iconv after output overflow; shrink test buffer to trigger overflow

---
 llvm/lib/Support/CharSet.cpp           | 3 +++
 llvm/unittests/Support/CharSetTest.cpp | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 52f00b736af451..55e0e2f1692346 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -66,6 +66,7 @@ static void HandleOverflow(size_t &Capacity, char *&Output,
   Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
                  ? 2 * Capacity
                  : std::numeric_limits<size_t>::max();
+  Result.resize(0);
   Result.resize_for_overwrite(Capacity);
   Output = static_cast<char *>(Result.data());
   OutputLength = Capacity;
@@ -205,6 +206,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
       // An error occured. Check if we can gracefully handle it.
       if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
+        // Reset converter
+        iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
         return std::error_code();
       } else {
         // Some other error occured.
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 4628a44ef7fff2..25f3455753908b 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -156,7 +156,7 @@ TEST(CharSet, RoundTrip) {
 TEST(CharSet, ShiftState2022) {
   // Earth string.
   StringRef Src(EarthUTF);
-  SmallString<64> Dst;
+  SmallString<8> Dst;
 
   ErrorOr<CharSetConverter> ConvTo2022 =
       CharSetConverter::create("UTF-8", "ISO-2022-JP");

>From 290e2bc65fb944d27853c6f3e4e5e0b8149f802e Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 29 Apr 2024 15:03:39 -0400
Subject: [PATCH 13/19] remove AutoFlush, remove stray comment

---
 llvm/include/llvm/Support/CharSet.h    | 18 ++++-------
 llvm/lib/Support/CharSet.cpp           | 43 +++++++++++++-------------
 llvm/unittests/Support/CharSetTest.cpp | 26 +++++++---------
 3 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 55d75d25102c1d..c1089b744ef764 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -36,8 +36,6 @@ class CharSetConverterImplBase {
   /// Converts a string.
   /// \param[in] Source source string
   /// \param[out] Result container for converted string
-  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for stateful encodings if true.
   /// \return error code in case something went wrong
   ///
   /// The following error codes can occur, among others:
@@ -55,8 +53,7 @@ class CharSetConverterImplBase {
   /// will be set to the initial state.
 
   virtual std::error_code convert(StringRef Source,
-                                  SmallVectorImpl<char> &Result,
-                                  bool ShouldAutoFlush) const = 0;
+                                  SmallVectorImpl<char> &Result) const = 0;
 };
 } // namespace details
 
@@ -113,18 +110,15 @@ class CharSetConverter {
   /// Converts a string.
   /// \param[in] Source source string
   /// \param[out] Result container for converted string
-  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
-  /// for stateful encodings.
   /// \return error code in case something went wrong
-  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
-                          bool ShouldAutoFlush = true) const {
-    return Converter->convert(Source, Result, ShouldAutoFlush);
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Converter->convert(Source, Result);
   }
 
-  ErrorOr<std::string> convert(StringRef Source,
-                               bool ShouldAutoFlush = true) const {
+  ErrorOr<std::string> convert(StringRef Source) const {
     SmallString<100> Result;
-    auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
+    auto EC = Converter->convert(Source, Result);
     if (!EC)
       return std::string(Result);
     return EC;
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 55e0e2f1692346..c00a1894e91708 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -57,6 +57,7 @@ static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
   return std::nullopt;
 }
 
+#if defined(HAVE_ICONV) || defined(HAVE_ICU)
 static void HandleOverflow(size_t &Capacity, char *&Output,
                            size_t &OutputLength,
                            SmallVectorImpl<char> &Result) {
@@ -71,6 +72,7 @@ static void HandleOverflow(size_t &Capacity, char *&Output,
   Output = static_cast<char *>(Result.data());
   OutputLength = Capacity;
 }
+#endif
 
 namespace {
 enum ConversionType {
@@ -89,13 +91,13 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
 public:
   CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
 
-  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
-                          bool ShouldAutoFlush) const override;
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const override;
 };
 
-std::error_code CharSetConverterTable::convert(StringRef Source,
-                                               SmallVectorImpl<char> &Result,
-                                               bool ShouldAutoFlush) const {
+std::error_code
+CharSetConverterTable::convert(StringRef Source,
+                               SmallVectorImpl<char> &Result) const {
   if (ConvType == IBM1047ToUTF) {
     ConverterEBCDIC::convertToUTF8(Source, Result);
     return std::error_code();
@@ -131,13 +133,13 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
       ToConvDesc = nullptr;
   }
 
-  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
-                          bool ShouldAutoFlush) const override;
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const override;
 };
 
-std::error_code CharSetConverterICU::convert(StringRef Source,
-                                             SmallVectorImpl<char> &Result,
-                                             bool ShouldAutoFlush) const {
+std::error_code
+CharSetConverterICU::convert(StringRef Source,
+                             SmallVectorImpl<char> &Result) const {
   // Setup the output. We directly write into the SmallVector.
   size_t Capacity = Result.capacity();
   size_t OutputLength = Capacity;
@@ -158,7 +160,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
                    &Input, In + InputLength, /*pivotStart=*/NULL,
                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
                    /*pivotLimit=*/NULL, /*reset=*/true,
-                   /*flush=*/ShouldAutoFlush, &EC);
+                   /*flush=*/true, &EC);
     if (U_FAILURE(EC)) {
       if (EC == U_BUFFER_OVERFLOW_ERROR &&
           Capacity < std::numeric_limits<size_t>::max()) {
@@ -182,13 +184,13 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
 public:
   CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
 
-  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
-                          bool ShouldAutoFlush) const override;
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const override;
 };
 
-std::error_code CharSetConverterIconv::convert(StringRef Source,
-                                               SmallVectorImpl<char> &Result,
-                                               bool ShouldAutoFlush) const {
+std::error_code
+CharSetConverterIconv::convert(StringRef Source,
+                               SmallVectorImpl<char> &Result) const {
   // Setup the input. Use nullptr to reset iconv state if input length is zero.
   size_t InputLength = Source.size();
   char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
@@ -226,11 +228,10 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
   while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
     if (auto EC = HandleError(Ret))
       return EC;
-  if (ShouldAutoFlush) {
-    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
-      if (auto EC = HandleError(Ret))
-        return EC;
-  }
+  // Flush the converter
+  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
 
   // Re-adjust size to actual size.
   Result.resize(Capacity - OutputLength);
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 25f3455753908b..579e21a86e18e5 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -46,8 +46,6 @@ static const char CyrillicUTF[] = "\xd0\xaf";
 // IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
 // back.
 static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
-// Identical to above, except the final character (球) has its last byte taken
-// away from it.
 static const char EarthISO2022[] =
     "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
 static const char EarthIBM939[] =
@@ -60,28 +58,28 @@ TEST(CharSet, FromUTF8) {
 
   CharSetConverter Conv = CharSetConverter::create(text_encoding::id::UTF8,
                                                    text_encoding::id::IBM1047);
-  std::error_code EC = Conv.convert(Src, Dst, true);
+  std::error_code EC = Conv.convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
   Dst.clear();
 
   // ABC string.
   Src = ABCStrA;
-  EC = Conv.convert(Src, Dst, true);
+  EC = Conv.convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
   Dst.clear();
 
   // Accent string.
   Src = AccentUTF;
-  EC = Conv.convert(Src, Dst, true);
+  EC = Conv.convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
   Dst.clear();
 
   // Cyrillic string. Results in error because not representable in 1047.
   Src = CyrillicUTF;
-  EC = Conv.convert(Src, Dst, true);
+  EC = Conv.convert(Src, Dst);
   EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
 }
 
@@ -92,21 +90,21 @@ TEST(CharSet, ToUTF8) {
 
   CharSetConverter Conv = CharSetConverter::create(text_encoding::id::IBM1047,
                                                    text_encoding::id::UTF8);
-  std::error_code EC = Conv.convert(Src, Dst, true);
+  std::error_code EC = Conv.convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
   Dst.clear();
 
   // ABC string.
   Src = ABCStrE;
-  EC = Conv.convert(Src, Dst, true);
+  EC = Conv.convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
   Dst.clear();
 
   // Accent string.
   Src = AccentE;
-  EC = Conv.convert(Src, Dst, true);
+  EC = Conv.convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
 }
@@ -144,11 +142,11 @@ TEST(CharSet, RoundTrip) {
 
   SmallString<99> Dst1Str, Dst2Str, Dst3Str;
 
-  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str, true);
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
   EXPECT_TRUE(!EC);
-  EC = ConvToUTF32->convert(Dst1Str, Dst2Str, true);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
   EXPECT_TRUE(!EC);
-  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str, true);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
 }
@@ -168,7 +166,7 @@ TEST(CharSet, ShiftState2022) {
   }
 
   // Check that the string is properly converted.
-  std::error_code EC = ConvTo2022->convert(Src, Dst, true);
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
 }
@@ -188,7 +186,7 @@ TEST(CharSet, ShiftStateIBM939) {
   }
 
   // Check that the string is properly converted.
-  std::error_code EC = ConvToIBM939->convert(Src, Dst, true);
+  std::error_code EC = ConvToIBM939->convert(Src, Dst);
   EXPECT_TRUE(!EC);
   EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
 }

>From f1d4e8ee37437fec4c97d5ba9b31c6298348878b Mon Sep 17 00:00:00 2001
From: Abhina Sree <69635948+abhina-sree at users.noreply.github.com>
Date: Wed, 1 May 2024 09:17:27 -0400
Subject: [PATCH 14/19] formatting nits

Remove comment that looks like code (unique_ptr should be easy enough to understand).

Co-authored-by: Hubert Tong <hubert-reinterpretcast at users.noreply.github.com>
---
 llvm/cmake/config-ix.cmake              |  4 ++--
 llvm/include/llvm/Config/config.h.cmake |  2 +-
 llvm/include/llvm/Support/CharSet.h     | 16 +++++++---------
 llvm/lib/Support/CMakeLists.txt         |  2 +-
 llvm/lib/Support/CharSet.cpp            |  9 ++++-----
 5 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 492941bf32021f..b503a337f488ab 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -272,14 +272,14 @@ if(LLVM_HAS_LOGF128)
   set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
 endif()
 
-#Check for icu.
+# Check for ICU.
 if(LLVM_ENABLE_ICU)
   set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
   set(CMAKE_FIND_LIBRARY_SUFFIXES ".so")
   if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
     find_package(ICU REQUIRED COMPONENTS uc i18n)
     if (NOT ICU_FOUND)
-      message(FATAL_ERROR "Failed to configure icu, but LLVM_ENABLE_ICU is FORCE_ON")
+      message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
     endif()
   else()
     find_package(ICU COMPONENTS uc i18n)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 77d352fc50e77e..ca82e1c8a30b12 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -279,7 +279,7 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
-/* Define if icu library is available */
+/* Define if ICU library is available */
 #cmakedefine HAVE_ICU ${HAVE_ICU}
 
 /* Define if iconv library is available */
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index c1089b744ef764..0d789e3ab637a0 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -46,12 +46,12 @@ class CharSetConverterImplBase {
   ///   - std::errc::invalid_argument: The input contains an incomplete
   ///     multibyte sequence.
   ///
+  /// If the destination charset is a stateful character set, the shift state
+  /// will be set to the initial state.
+  ///
   /// In case of an error, the result string contains the successfully converted
   /// part of the input string.
   ///
-  /// If the destination charset is a stateful character set, the shift state
-  /// will be set to the initial state.
-
   virtual std::error_code convert(StringRef Source,
                                   SmallVectorImpl<char> &Result) const = 0;
 };
@@ -70,7 +70,6 @@ enum class id {
 
 /// Utility class to convert between different character set encodings.
 class CharSetConverter {
-  // details::CharSetConverterImplBase *Converter;
   std::unique_ptr<details::CharSetConverterImplBase> Converter;
 
   CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
@@ -78,8 +77,8 @@ class CharSetConverter {
 
 public:
   /// Creates a CharSetConverter instance.
-  /// \param[in] CSFrom name of the source character encoding
-  /// \param[in] CSTo name of the target character encoding
+  /// \param[in] CSFrom the source character encoding
+  /// \param[in] CSTo the target character encoding
   /// \return a CharSetConverter instance
   static CharSetConverter create(text_encoding::id CSFrom,
                                  text_encoding::id CSTo);
@@ -95,9 +94,8 @@ class CharSetConverter {
   CharSetConverter(const CharSetConverter &) = delete;
   CharSetConverter &operator=(const CharSetConverter &) = delete;
 
-  CharSetConverter(CharSetConverter &&Other) {
-    Converter = std::move(Other.Converter);
-  }
+  CharSetConverter(CharSetConverter &&Other)
+      : Converter(std::move(Other.Converter)) {}
 
   CharSetConverter &operator=(CharSetConverter &&Other) {
     if (this != &Other)
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index ce506b9b226da3..f7284361903766 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -311,7 +311,7 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
-# Link icu library if it is an external library.
+# Link ICU library if it is an external library.
 if(ICU_FOUND)
   target_link_libraries(LLVMSupport
   PRIVATE
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index c00a1894e91708..2b984582794f97 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -8,7 +8,7 @@
 ///
 /// \file
 /// This file provides utility classes to convert between different character
-/// set encoding.
+/// set encodings.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -80,7 +80,7 @@ enum ConversionType {
   IBM1047ToUTF,
 };
 
-// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// Support conversion between EBCDIC 1047 and UTF-8. This class uses
 // built-in translation tables that allow for translation between the
 // aforementioned character sets. The use of tables for conversion is only
 // possible because EBCDIC 1047 is a single-byte, stateless encoding; other
@@ -166,9 +166,8 @@ CharSetConverterICU::convert(StringRef Source,
           Capacity < std::numeric_limits<size_t>::max()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
         continue;
-      } else
-        // Some other error occured.
-        return std::error_code(EILSEQ, std::generic_category());
+      // Some other error occured.
+      return std::error_code(EILSEQ, std::generic_category());
     }
     break;
   } while (true);

>From 0139c9e1b1957366f3b394adbdc95db8de843106 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 24 May 2024 11:43:45 -0400
Subject: [PATCH 15/19] Refactor ICU code

---
 llvm/lib/Support/CharSet.cpp | 81 ++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 2b984582794f97..0f88d0b9056c74 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -76,8 +76,8 @@ static void HandleOverflow(size_t &Capacity, char *&Output,
 
 namespace {
 enum ConversionType {
-  UTFToIBM1047,
-  IBM1047ToUTF,
+  UTF8ToIBM1047,
+  IBM1047ToUTF8,
 };
 
 // Support conversion between EBCDIC 1047 and UTF-8. This class uses
@@ -98,10 +98,10 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
 std::error_code
 CharSetConverterTable::convert(StringRef Source,
                                SmallVectorImpl<char> &Result) const {
-  if (ConvType == IBM1047ToUTF) {
+  if (ConvType == IBM1047ToUTF8) {
     ConverterEBCDIC::convertToUTF8(Source, Result);
     return std::error_code();
-  } else if (ConvType == UTFToIBM1047) {
+  } else if (ConvType == UTF8ToIBM1047) {
     return ConverterEBCDIC::convertToEBCDIC(Source, Result);
   }
   llvm_unreachable("Invalid ConvType!");
@@ -109,29 +109,23 @@ CharSetConverterTable::convert(StringRef Source,
 }
 
 #ifdef HAVE_ICU
+struct UConverterDeleter {
+  void operator()(UConverter *Converter) const {
+    if (Converter)
+      ucnv_close(Converter);
+  }
+};
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
 class CharSetConverterICU : public details::CharSetConverterImplBase {
-  UConverter *FromConvDesc;
-  UConverter *ToConvDesc;
+  UConverterUniquePtr FromConvDesc;
+  UConverterUniquePtr ToConvDesc;
 
 public:
-  CharSetConverterICU(UConverter *Converter) {
-    UErrorCode EC = U_ZERO_ERROR;
-    FromConvDesc = nullptr;
-    ToConvDesc = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
-    if (U_FAILURE(EC)) {
-      ToConvDesc = nullptr;
-    }
-  };
-
-  CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter) {
-    UErrorCode EC = U_ZERO_ERROR;
-    FromConvDesc = ucnv_safeClone(FromConverter, nullptr, nullptr, &EC);
-    if (U_FAILURE(EC))
-      FromConvDesc = nullptr;
-    ToConvDesc = ucnv_safeClone(ToConverter, nullptr, nullptr, &EC);
-    if (U_FAILURE(EC))
-      ToConvDesc = nullptr;
-  }
+  CharSetConverterICU(UConverterUniquePtr FromConverter,
+                      UConverterUniquePtr ToConverter)
+      : FromConvDesc(std::move(FromConverter)),
+        ToConvDesc(std::move(ToConverter)) {}
 
   std::error_code convert(StringRef Source,
                           SmallVectorImpl<char> &Result) const override;
@@ -140,24 +134,23 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
 std::error_code
 CharSetConverterICU::convert(StringRef Source,
                              SmallVectorImpl<char> &Result) const {
+  // Setup the input in case it has no backing data.
+  size_t InputLength = Source.size();
+  const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
+
   // Setup the output. We directly write into the SmallVector.
   size_t Capacity = Result.capacity();
   size_t OutputLength = Capacity;
-  char *Output, *Out;
   Result.resize_for_overwrite(Capacity);
-
+  char *Output = static_cast<char *>(Result.data());
   UErrorCode EC = U_ZERO_ERROR;
-
   do {
     EC = U_ZERO_ERROR;
-    size_t InputLength = Source.size();
-    const char *Input =
-        InputLength ? const_cast<char *>(Source.data()) : nullptr;
-    const char *In = Input;
-    Output = static_cast<char *>(Result.data());
-    Out = Output;
-    ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
-                   &Input, In + InputLength, /*pivotStart=*/NULL,
+    const char *Input = In;
+
+    Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+    ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
+                   In + InputLength, /*pivotStart=*/NULL,
                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
                    /*pivotLimit=*/NULL, /*reset=*/true,
                    /*flush=*/true, &EC);
@@ -166,13 +159,14 @@ CharSetConverterICU::convert(StringRef Source,
           Capacity < std::numeric_limits<size_t>::max()) {
         HandleOverflow(Capacity, Output, OutputLength, Result);
         continue;
+      }
       // Some other error occured.
       return std::error_code(EILSEQ, std::generic_category());
     }
     break;
   } while (true);
 
-  Result.resize(Output - Out);
+  Result.resize(Output - Result.data());
   return std::error_code();
 }
 
@@ -247,9 +241,13 @@ CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
 
   ConversionType Conversion;
   if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
-    Conversion = UTFToIBM1047;
+    Conversion = UTF8ToIBM1047;
+  else if (CPFrom == text_encoding::id::IBM1047 &&
+           CPTo == text_encoding::id::UTF8)
+    Conversion = IBM1047ToUTF8;
   else
-    Conversion = IBM1047ToUTF;
+    assert(false &&
+           "Only conversions between UTF-8 and IBM-1047 are supported");
   std::unique_ptr<details::CharSetConverterImplBase> Converter =
       std::make_unique<CharSetConverterTable>(Conversion);
 
@@ -264,16 +262,17 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
     return create(*From, *To);
 #ifdef HAVE_ICU
   UErrorCode EC = U_ZERO_ERROR;
-  UConverter *FromConvDesc = ucnv_open(CSFrom.str().c_str(), &EC);
+  UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
   if (U_FAILURE(EC)) {
     return std::error_code(errno, std::generic_category());
   }
-  UConverter *ToConvDesc = ucnv_open(CSTo.str().c_str(), &EC);
+  UConverterUniquePtr ToConvDesc(ucnv_open(CSTo.str().c_str(), &EC));
   if (U_FAILURE(EC)) {
     return std::error_code(errno, std::generic_category());
   }
   std::unique_ptr<details::CharSetConverterImplBase> Converter =
-      std::make_unique<CharSetConverterICU>(FromConvDesc, ToConvDesc);
+      std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
+                                            std::move(ToConvDesc));
   return CharSetConverter(std::move(Converter));
 #elif defined(HAVE_ICONV)
   iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());

>From 2a8da8e1481e05e894e895c7c8adaa57a5954317 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 29 May 2024 13:25:13 -0400
Subject: [PATCH 16/19] refactor iconv

---
 llvm/lib/Support/CharSet.cpp | 65 +++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 0f88d0b9056c74..8b89eb5c1129d3 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -172,10 +172,34 @@ CharSetConverterICU::convert(StringRef Source,
 
 #elif defined(HAVE_ICONV)
 class CharSetConverterIconv : public details::CharSetConverterImplBase {
-  iconv_t ConvDesc;
+  class UniqueIconvT {
+    iconv_t ConvDesc;
+
+  public:
+    operator iconv_t() const { return ConvDesc; }
+    UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
+    ~UniqueIconvT() {
+      if (ConvDesc != (iconv_t)-1) {
+        iconv_close(ConvDesc);
+        ConvDesc = (iconv_t)-1;
+      }
+    }
+    UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
+      Other.ConvDesc = (iconv_t)-1;
+    }
+    UniqueIconvT &operator=(UniqueIconvT &&Other) {
+      if (&Other != this) {
+        ConvDesc = Other.ConvDesc;
+        Other.ConvDesc = (iconv_t)-1;
+      }
+      return *this;
+    }
+  };
+  UniqueIconvT ConvDesc;
 
 public:
-  CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+  CharSetConverterIconv(UniqueIconvT ConvDesc)
+      : ConvDesc(std::move(ConvDesc)) {}
 
   std::error_code convert(StringRef Source,
                           SmallVectorImpl<char> &Result) const override;
@@ -184,19 +208,16 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
 std::error_code
 CharSetConverterIconv::convert(StringRef Source,
                                SmallVectorImpl<char> &Result) const {
-  // Setup the input. Use nullptr to reset iconv state if input length is zero.
-  size_t InputLength = Source.size();
-  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
   // Setup the output. We directly write into the SmallVector.
-  Result.resize_for_overwrite(Source.size());
   size_t Capacity = Result.capacity();
   char *Output = static_cast<char *>(Result.data());
   size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
 
   size_t Ret;
-
   // Handle errors returned from iconv().
-  auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
+                      this](size_t Ret) {
     if (Ret == static_cast<size_t>(-1)) {
       // An error occured. Check if we can gracefully handle it.
       if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
@@ -217,14 +238,26 @@ CharSetConverterIconv::convert(StringRef Source,
     }
   };
 
-  // Convert the string.
-  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
-    if (auto EC = HandleError(Ret))
-      return EC;
-  // Flush the converter
-  while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
-    if (auto EC = HandleError(Ret))
-      return EC;
+  do {
+    // Setup the input. Use nullptr to reset iconv state if input length is
+    // zero.
+    size_t InputLength = Source.size();
+    char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+    Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    // Flush the converter
+    Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    break;
+  } while (true);
 
   // Re-adjust size to actual size.
   Result.resize(Capacity - OutputLength);

>From aad2f4cf052924f295717e78a96333315c93ef35 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 31 May 2024 15:22:14 -0400
Subject: [PATCH 17/19] resize output if error

---
 llvm/lib/Support/CharSet.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 8b89eb5c1129d3..ce8cbc217e552b 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -161,6 +161,7 @@ CharSetConverterICU::convert(StringRef Source,
         continue;
       }
       // Some other error occured.
+      Result.resize(Output - Result.data());
       return std::error_code(EILSEQ, std::generic_category());
     }
     break;
@@ -227,6 +228,7 @@ CharSetConverterIconv::convert(StringRef Source,
         return std::error_code();
       } else {
         // Some other error occured.
+        Result.resize(Output - Result.data());
         return std::error_code(errno, std::generic_category());
       }
     } else {

>From 72926e6c97ce7a189be21acc4c1dbde49c519898 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 6 Jan 2025 10:39:05 -0500
Subject: [PATCH 18/19] address some comments

---
 llvm/cmake/config-ix.cmake                   |  2 +-
 llvm/include/llvm/Support/CharSet.h          | 18 ++++++--
 llvm/lib/Support/CharSet.cpp                 | 47 +++++++++++++-------
 llvm/unittests/Support/ConvertEBCDICTest.cpp |  4 +-
 4 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index b503a337f488ab..052fd267505f94 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -288,7 +288,7 @@ if(LLVM_ENABLE_ICU)
   set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
 endif()
 
-# Check for iconv.
+# Check for builtin iconv to avoid licensing issues.
 if(LLVM_ENABLE_ICONV)
   if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
     find_package(Iconv REQUIRED)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 0d789e3ab637a0..a0c9ba36a0f47f 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -33,6 +33,9 @@ class CharSetConverterImplBase {
 public:
   virtual ~CharSetConverterImplBase() = default;
 
+  /// Resets the converter to the initial state.
+  virtual void reset() = 0;
+
   /// Converts a string.
   /// \param[in] Source source string
   /// \param[out] Result container for converted string
@@ -52,8 +55,12 @@ class CharSetConverterImplBase {
   /// In case of an error, the result string contains the successfully converted
   /// part of the input string.
   ///
-  virtual std::error_code convert(StringRef Source,
-                                  SmallVectorImpl<char> &Result) const = 0;
+
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const;
+
+  virtual std::error_code convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) = 0;
 };
 } // namespace details
 
@@ -111,12 +118,15 @@ class CharSetConverter {
   /// \return error code in case something went wrong
   std::error_code convert(StringRef Source,
                           SmallVectorImpl<char> &Result) const {
-    return Converter->convert(Source, Result);
+    auto EC = Converter->convertString(Source, Result);
+    Converter->reset();
+    return EC;
   }
 
   ErrorOr<std::string> convert(StringRef Source) const {
     SmallString<100> Result;
-    auto EC = Converter->convert(Source, Result);
+    auto EC = Converter->convertString(Source, Result);
+    Converter->reset();
     if (!EC)
       return std::string(Result);
     return EC;
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index ce8cbc217e552b..1ec91975d8159b 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -91,13 +91,15 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
 public:
   CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
 
-  std::error_code convert(StringRef Source,
-                          SmallVectorImpl<char> &Result) const override;
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override {}
 };
 
 std::error_code
-CharSetConverterTable::convert(StringRef Source,
-                               SmallVectorImpl<char> &Result) const {
+CharSetConverterTable::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
   if (ConvType == IBM1047ToUTF8) {
     ConverterEBCDIC::convertToUTF8(Source, Result);
     return std::error_code();
@@ -127,13 +129,15 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
       : FromConvDesc(std::move(FromConverter)),
         ToConvDesc(std::move(ToConverter)) {}
 
-  std::error_code convert(StringRef Source,
-                          SmallVectorImpl<char> &Result) const override;
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
 };
 
 std::error_code
-CharSetConverterICU::convert(StringRef Source,
-                             SmallVectorImpl<char> &Result) const {
+CharSetConverterICU::convertString(StringRef Source,
+                                   SmallVectorImpl<char> &Result) {
   // Setup the input in case it has no backing data.
   size_t InputLength = Source.size();
   const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
@@ -171,6 +175,11 @@ CharSetConverterICU::convert(StringRef Source,
   return std::error_code();
 }
 
+void CharSetConverterICU::reset() {
+  ucnv_reset(&*FromConvDesc); // Both converters are reset, not just one.
+  ucnv_reset(&*ToConvDesc);
+}
+
 #elif defined(HAVE_ICONV)
 class CharSetConverterIconv : public details::CharSetConverterImplBase {
   class UniqueIconvT {
@@ -202,13 +211,15 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
   CharSetConverterIconv(UniqueIconvT ConvDesc)
       : ConvDesc(std::move(ConvDesc)) {}
 
-  std::error_code convert(StringRef Source,
-                          SmallVectorImpl<char> &Result) const override;
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
 };
 
 std::error_code
-CharSetConverterIconv::convert(StringRef Source,
-                               SmallVectorImpl<char> &Result) const {
+CharSetConverterIconv::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
   // Setup the output. We directly write into the SmallVector.
   size_t Capacity = Result.capacity();
   char *Output = static_cast<char *>(Result.data());
@@ -262,10 +273,14 @@ CharSetConverterIconv::convert(StringRef Source,
   } while (true);
 
   // Re-adjust size to actual size.
-  Result.resize(Capacity - OutputLength);
+  Result.resize(Output - Result.data());
   return std::error_code();
 }
 
+void CharSetConverterIconv::reset() { // All-null iconv() call resets state.
+  iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+}
+
 #endif // HAVE_ICONV
 } // namespace
 
@@ -281,8 +296,7 @@ CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
            CPTo == text_encoding::id::UTF8)
     Conversion = IBM1047ToUTF8;
   else
-    assert(false &&
-           "Only conversions between UTF-8 and IBM-1047 are supported");
+    llvm_unreachable("Invalid ConversionType!");
   std::unique_ptr<details::CharSetConverterImplBase> Converter =
       std::make_unique<CharSetConverterTable>(Conversion);
 
@@ -316,6 +330,7 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
   std::unique_ptr<details::CharSetConverterImplBase> Converter =
       std::make_unique<CharSetConverterIconv>(ConvDesc);
   return CharSetConverter(std::move(Converter));
-#endif
+#else
   return std::make_error_code(std::errc::invalid_argument);
+#endif
 }
diff --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp
index eec76879ac92c5..557f29c391f9cb 100644
--- a/llvm/unittests/Support/ConvertEBCDICTest.cpp
+++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp
@@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
 // String with Cyrillic character ya.
 static const char CyrillicUTF[] = "\xd0\xaf";
 
-TEST(CharSet, FromUTF8) {
+TEST(ConverterEBCDIC, convertToEBCDIC) {
   // Hello string.
   StringRef Src(HelloA);
   SmallString<64> Dst;
@@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) {
   Dst.clear();
 }
 
-TEST(CharSet, ToUTF8) {
+TEST(ConverterEBCDIC, convertFromEBCDIC) {
   // Hello string.
   StringRef Src(HelloE);
   SmallString<64> Dst;

>From fd68bebafbdb0ce23b779048759b75fbf86e9fdf Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 13 Jan 2025 10:12:01 -0500
Subject: [PATCH 19/19] add callback function to properly report errors

---
 llvm/lib/Support/CharSet.cpp           |  6 ++++++
 llvm/unittests/Support/CharSetTest.cpp | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 1ec91975d8159b..509cd8209ff06d 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -148,6 +148,12 @@ CharSetConverterICU::convertString(StringRef Source,
   Result.resize_for_overwrite(Capacity);
   char *Output = static_cast<char *>(Result.data());
   UErrorCode EC = U_ZERO_ERROR;
+  // Stop (rather than substitute) on undecodable input and unmappable output.
+  ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
+                      &EC);
+  ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
+                        NULL, &EC);
+
   do {
     EC = U_ZERO_ERROR;
     const char *Input = In;
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 579e21a86e18e5..f68411690bc8ff 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -50,6 +50,8 @@ static const char EarthISO2022[] =
     "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
 static const char EarthIBM939[] =
     "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+static const char EarthUTFExtraPartial[] =
+    "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
 
 TEST(CharSet, FromUTF8) {
   // Hello string.
@@ -171,6 +173,25 @@ TEST(CharSet, ShiftState2022) {
   EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
 }
 
+TEST(CharSet, ShiftState2022Partial) {
+  // Earth string followed by a lone UTF-8 lead byte (truncated sequence).
+  StringRef Src(EarthUTFExtraPartial);
+  SmallString<8> Dst;
+
+  ErrorOr<CharSetConverter> ConvTo2022 =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvTo2022) {
+    ASSERT_EQ(ConvTo2022.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // The truncated input must be reported as a conversion error.
+  std::error_code EC = ConvTo2022->convert(Src, Dst);
+  EXPECT_TRUE(EC);
+}
+
 TEST(CharSet, ShiftStateIBM939) {
   // Earth string.
   StringRef Src(EarthUTF);