[llvm] Create a CharSetConverter class with both iconv and icu support (PR #74516)
Abhina Sree via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 07:12:28 PST 2025
https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/74516
>From bbf1cd20b1236fc4d4fffb19925382a2a4f33720 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 5 Dec 2023 15:08:43 -0500
Subject: [PATCH 01/19] Create a CharSetConverter class with both iconv and icu
support.
---
llvm/cmake/config-ix.cmake | 16 +
llvm/include/llvm/Config/config.h.cmake | 6 +
llvm/include/llvm/Support/CharSet.h | 160 ++++++++++
llvm/lib/Support/CMakeLists.txt | 17 ++
llvm/lib/Support/CharSet.cpp | 370 ++++++++++++++++++++++++
llvm/unittests/Support/CMakeLists.txt | 1 +
llvm/unittests/Support/CharSetTest.cpp | 281 ++++++++++++++++++
7 files changed, 851 insertions(+)
create mode 100644 llvm/include/llvm/Support/CharSet.h
create mode 100644 llvm/lib/Support/CharSet.cpp
create mode 100644 llvm/unittests/Support/CharSetTest.cpp
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 64878d28d9e1e5..0504a5b2d742ba 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -272,6 +272,22 @@ if(LLVM_HAS_LOGF128)
set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
endif()
+# Check for ICU.
+find_package(ICU COMPONENTS uc i18n)
+if(ICU_FOUND)
+ set(HAVE_ICU 1)
+else()
+ set(HAVE_ICU 0)
+endif()
+
+# Check for iconv.
+find_package(Iconv)
+if(Iconv_FOUND)
+ set(HAVE_ICONV 1)
+else()
+ set(HAVE_ICONV 0)
+endif()
+
# function checks
check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 3e6b94dfbe5458..77d352fc50e77e 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -279,6 +279,12 @@
/* Have host's ___chkstk_ms */
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
+/* Define if icu library is available */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
/* Linker version detected at compile time. */
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 00000000000000..856b3be65ff7ed
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,160 @@
+//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+/// Abstract interface of a conversion backend. CharSetConverter owns one
+/// instance of a class derived from it and forwards its calls to it.
+class CharSetConverterImplBase {
+public:
+ virtual ~CharSetConverterImplBase() = default;
+
+ /// Converts a string.
+ /// \param[in] Source source string
+ /// \param[in,out] Result container for converted string
+ /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+ /// for multi-byte encodings iff true.
+ /// \return error code in case something went wrong
+ ///
+ /// The following error codes can occur, among others:
+ /// - std::errc::argument_list_too_long: The result requires more than
+ /// std::numeric_limits<size_t>::max() bytes.
+ /// - std::errc::illegal_byte_sequence: The input contains an invalid
+ /// multibyte sequence.
+ /// - std::errc::invalid_argument: The input contains an incomplete
+ /// multibyte sequence.
+ ///
+ /// In case of an error, the result string contains the successfully converted
+ /// part of the input string.
+
+ virtual std::error_code convert(StringRef Source,
+ SmallVectorImpl<char> &Result,
+ bool ShouldAutoFlush) const = 0;
+
+ /// Restore the conversion to the original state.
+ /// \return error code in case something went wrong
+ ///
+ /// If the original character set or the destination character set
+ /// are multi-byte character sets, set the shift state to the initial
+ /// state. Otherwise this is a no-op.
+ virtual std::error_code flush() const = 0;
+
+ /// Restore the conversion to the original state, handing the backend an
+ /// output container.
+ /// \param[in,out] Result container that receives any bytes the backend
+ /// emits while returning to the initial state
+ /// \return error code in case something went wrong
+ virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+/// Encodings that can be selected by identifier instead of by name when
+/// creating a CharSetConverter.
+enum class id {
+ /// UTF-8 character set encoding.
+ UTF8,
+
+ /// IBM EBCDIC 1047 character set encoding.
+ IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+///
+/// Instances are obtained from one of the create() factories. The class is
+/// movable but not copyable because it uniquely owns its conversion backend.
+class CharSetConverter {
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  explicit CharSetConverter(
+      std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// \param[in] CSFrom identifier of the source character encoding
+  /// \param[in] CSTo identifier of the target character encoding
+  /// \return a CharSetConverter instance
+  static CharSetConverter create(text_encoding::id CSFrom,
+                                 text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  // Moving only transfers ownership of the backend, which cannot throw.
+  CharSetConverter(CharSetConverter &&Other) noexcept = default;
+  CharSetConverter &operator=(CharSetConverter &&Other) noexcept = default;
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings.
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return Converter->convert(Source, Result, ShouldAutoFlush);
+  }
+
+  /// Converts a single character without appending a shift-back sequence.
+  /// \param[in] SingleChar character to convert
+  /// \return the first byte of the converted output, or '\0' if the
+  /// conversion reported an error or produced no output
+  char convert(char SingleChar) const {
+    SmallString<1> Result;
+    const std::error_code EC =
+        Converter->convert(StringRef(&SingleChar, 1), Result, false);
+    // Never index into the buffer after a failed or empty conversion; the
+    // element access would be out of bounds.
+    if (EC || Result.empty())
+      return '\0';
+    return Result[0];
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  std::error_code convert(const std::string &Source,
+                          SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return convert(StringRef(Source), Result, ShouldAutoFlush);
+  }
+
+  /// Restores the conversion to its initial state.
+  /// \return error code in case something went wrong
+  std::error_code flush() const { return Converter->flush(); }
+
+  /// Restores the conversion to its initial state, passing \p Result to the
+  /// backend as the container for any bytes it emits while doing so.
+  /// \return error code in case something went wrong
+  std::error_code flush(SmallVectorImpl<char> &Result) const {
+    return Converter->flush(Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 2ecaea4b02bf61..87543eb66f75b9 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -160,6 +160,7 @@ add_llvm_component_library(LLVMSupport
CachePruning.cpp
Caching.cpp
circular_raw_ostream.cpp
+ CharSet.cpp
Chrono.cpp
COM.cpp
CodeGenCoverage.cpp
@@ -310,6 +311,22 @@ add_llvm_component_library(LLVMSupport
Demangle
)
+# Link icu library if it is an external library.
+if(ICU_FOUND)
+ target_link_libraries(LLVMSupport
+ PRIVATE
+ ${ICU_LIBRARIES}
+ )
+else()
+ # Link iconv library if it is an external library.
+ if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+ target_link_libraries(LLVMSupport
+ PRIVATE
+ ${Iconv_LIBRARIES}
+ )
+ endif()
+endif()
+
set(llvm_system_libs ${system_libs})
# This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 00000000000000..dbc2cb7c1839d2
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,370 @@
+//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encoding.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+//
+// Only alphanumeric characters are kept, they are lower-cased, and a '0' is
+// dropped unless the previously kept character was a digit. The normalized
+// name is appended to Normalized; e.g. "UTF-08" is appended as "utf8".
+//
+// The helper is only used inside this file, hence the internal linkage.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (char Ch : CSName) {
+    if (!isAlnum(Ch))
+      continue;
+    Ch = toLower(Ch);
+    // A zero that does not follow a digit is a leading zero; drop it.
+    if (Ch == '0' && !PrevDigit)
+      continue;
+    PrevDigit = isDigit(Ch);
+    Normalized.push_back(Ch);
+  }
+}
+
+// Maps the charset name to enum constant if possible.
+//
+// The name is normalized first, so spelling variants such as "UTF-8" and
+// "utf8" are treated alike. Returns std::nullopt when the normalized name is
+// neither "utf8" nor "ibm1047".
+//
+// The helper is only used inside this file, hence the internal linkage.
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+  const StringRef Name = Normalized.str();
+  if (Name == "utf8")
+    return text_encoding::id::UTF8;
+  if (Name == "ibm1047")
+    return text_encoding::id::IBM1047;
+  return std::nullopt;
+}
+
+namespace {
+// Direction of a conversion performed by CharSetConverterTable.
+enum ConversionType {
+ // UTF-8 input is converted to IBM-1047.
+ UTFToIBM1047,
+ // IBM-1047 input is converted to UTF-8.
+ IBM1047ToUTF,
+};
+
+// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable final : public details::CharSetConverterImplBase {
+  // Direction of the conversion; fixed for the lifetime of the object.
+  const ConversionType ConvType;
+
+public:
+  // explicit: a ConversionType must not silently convert into a converter.
+  explicit CharSetConverterTable(ConversionType ConvType)
+      : ConvType(ConvType) {}
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+// Dispatches to the EBCDIC translation routine that matches the direction this
+// converter was created for. ShouldAutoFlush is not consulted here.
+std::error_code CharSetConverterTable::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  switch (ConvType) {
+  case UTFToIBM1047:
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  case IBM1047ToUTF:
+    // This direction does not report an error code.
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  }
+  llvm_unreachable("Invalid ConvType!");
+}
+
+// Nothing to reset for the table-driven conversion: always reports success.
+std::error_code CharSetConverterTable::flush() const { return {}; }
+
+// Leaves Result untouched and always reports success.
+std::error_code
+CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
+  return {};
+}
+
+#ifdef HAVE_ICU
+// Converter backed by ICU. The converters handed to the constructors are
+// cloned; the clones are owned by this object and closed on destruction.
+class CharSetConverterICU final : public details::CharSetConverterImplBase {
+  UConverter *FromConvDesc = nullptr;
+  UConverter *ToConvDesc = nullptr;
+
+  // Returns a clone of Converter, or nullptr if cloning fails. Each clone
+  // gets a fresh error code so one failure cannot poison the next call.
+  static UConverter *cloneOrNull(UConverter *Converter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    UConverter *Clone = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
+    return U_FAILURE(EC) ? nullptr : Clone;
+  }
+
+public:
+  explicit CharSetConverterICU(UConverter *Converter)
+      : ToConvDesc(cloneOrNull(Converter)) {}
+
+  CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter)
+      : FromConvDesc(cloneOrNull(FromConverter)),
+        ToConvDesc(cloneOrNull(ToConverter)) {}
+
+  // The clones are closed exactly once, so copying must not be possible.
+  CharSetConverterICU(const CharSetConverterICU &) = delete;
+  CharSetConverterICU &operator=(const CharSetConverterICU &) = delete;
+
+  ~CharSetConverterICU() override {
+    if (FromConvDesc)
+      ucnv_close(FromConvDesc);
+    if (ToConvDesc)
+      ucnv_close(ToConvDesc);
+  }
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterICU::convert(StringRef Source,
+ SmallVectorImpl<char> &Result,
+ bool ShouldAutoFlush) const {
+ // Setup the output. We directly write into the SmallVector.
+ size_t OutputLength, Capacity = Result.capacity();
+ char *Output, *Out;
+
+ UErrorCode EC = U_ZERO_ERROR;
+
+ auto HandleError = [&Capacity, &Output, &OutputLength,
+ &Result](UErrorCode UEC) {
+ if (UEC == U_BUFFER_OVERFLOW_ERROR &&
+ Capacity < std::numeric_limits<size_t>::max()) {
+ // No space left in output buffer. Double the size of the underlying
+ // memory in the SmallVectorImpl, adjust pointer and length and continue
+ // the conversion.
+ Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+ ? 2 * Capacity
+ : std::numeric_limits<size_t>::max();
+ Result.resize_for_overwrite(Capacity);
+ Output = static_cast<char *>(Result.data());
+ OutputLength = Capacity;
+ return std::error_code();
+ } else {
+ // Some other error occured.
+ return std::error_code(errno, std::generic_category());
+ }
+ };
+
+ do {
+ EC = U_ZERO_ERROR;
+ size_t InputLength = Source.size();
+ const char *Input =
+ InputLength ? const_cast<char *>(Source.data()) : nullptr;
+ const char *In = Input;
+ Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+ OutputLength = Capacity;
+ Out = Output;
+ Result.resize_for_overwrite(Capacity);
+ ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
+ &Input, In + InputLength, /*pivotStart=*/NULL,
+ /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
+ /*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC);
+ if (U_FAILURE(EC)) {
+ if (auto error = HandleError(EC))
+ return error;
+ } else if (U_SUCCESS(EC))
+ break;
+ } while (U_FAILURE(EC));
+
+ Result.resize(Output - Out);
+ return std::error_code();
+}
+
+// Nothing to do for the ICU-backed converter: always reports success.
+std::error_code CharSetConverterICU::flush() const {
+  return {};
+}
+
+// Leaves Result untouched and always reports success.
+std::error_code
+CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
+  return {};
+}
+
+#elif defined(HAVE_ICONV)
+// Converter backed by iconv. Takes ownership of the conversion descriptor it
+// is constructed with and closes it on destruction.
+class CharSetConverterIconv final : public details::CharSetConverterImplBase {
+  iconv_t ConvDesc;
+
+public:
+  explicit CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+
+  // The descriptor is closed exactly once, so copying must not be possible.
+  CharSetConverterIconv(const CharSetConverterIconv &) = delete;
+  CharSetConverterIconv &operator=(const CharSetConverterIconv &) = delete;
+
+  ~CharSetConverterIconv() override {
+    // (iconv_t)-1 is the value iconv_open() returns on failure.
+    if (ConvDesc != (iconv_t)-1)
+      iconv_close(ConvDesc);
+  }
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+// Converts Source with iconv and stores the output in Result, replacing its
+// previous contents. If ShouldAutoFlush is set, iconv is additionally called
+// with a null input afterwards so that it can emit its pending output. On
+// error, Result is trimmed to the bytes written before the failure.
+std::error_code CharSetConverterIconv::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  constexpr size_t MaxCapacity = std::numeric_limits<size_t>::max();
+
+  // Setup the input. Use nullptr to reset iconv state if input length is zero.
+  size_t InputLength = Source.size();
+  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+
+  // Setup the output. We directly write into the SmallVector. When there is
+  // input, start from a non-zero size: doubling zero never makes progress.
+  size_t Capacity = Result.capacity();
+  if (Capacity == 0 && InputLength != 0)
+    Capacity = InputLength;
+  Result.resize_for_overwrite(Capacity);
+  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  size_t OutputLength = Capacity;
+
+  // Trims Result to the bytes written so far and passes the error through.
+  auto Fail = [&](std::error_code EC) -> std::error_code {
+    Result.resize(Capacity - OutputLength);
+    return EC;
+  };
+
+  // Handle errors returned from iconv().
+  auto HandleError = [&](size_t Ret) -> std::error_code {
+    if (Ret != static_cast<size_t>(-1)) {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return Fail(std::make_error_code(std::errc::illegal_byte_sequence));
+    }
+
+    // Read errno before anything else gets a chance to overwrite it.
+    const int Err = errno;
+    if (Err != E2BIG || Capacity == MaxCapacity)
+      return Fail(std::error_code(Err, std::generic_category()));
+
+    // No space left in output buffer. Double the size of the underlying
+    // memory in the SmallVectorImpl, adjust pointer and length and continue
+    // the conversion.
+    const size_t Used = Capacity - OutputLength;
+    if (Capacity == 0)
+      Capacity = 1;
+    else
+      Capacity = (Capacity < MaxCapacity / 2) ? 2 * Capacity : MaxCapacity;
+    Result.resize_for_overwrite(Capacity);
+    Output = static_cast<char *>(Result.data()) + Used;
+    OutputLength = Capacity - Used;
+    return std::error_code();
+  };
+
+  size_t Ret;
+
+  // Convert the string.
+  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+  if (ShouldAutoFlush) {
+    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+      if (auto EC = HandleError(Ret))
+        return EC;
+  }
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+// Calls iconv() with neither input nor output buffers and reports errno as
+// the error code if that call fails.
+std::error_code CharSetConverterIconv::flush() const {
+  const size_t Status = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+  const bool Failed = Status == static_cast<size_t>(-1);
+  return Failed ? std::error_code(errno, std::generic_category())
+                : std::error_code();
+}
+
+// Calls iconv() with a null input and writes whatever it emits directly into
+// the storage of Result, overwriting the previous contents. On success Result
+// is resized to the number of bytes written.
+// NOTE(review): on the error paths Result is left resized to its capacity;
+// confirm whether callers rely on its contents after a failure.
+std::error_code
+CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
+ char *Output = Result.data();
+ size_t OutputLength = Result.capacity();
+ size_t Capacity = Result.capacity();
+ // Make the whole capacity addressable; the size is corrected at the end.
+ Result.resize_for_overwrite(Capacity);
+
+ // Handle errors returned from iconv().
+ auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+ if (Ret == static_cast<size_t>(-1)) {
+ // An error occurred. Check if we can gracefully handle it.
+ if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+ // No space left in output buffer. Increase the size of the underlying
+ // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
+ // and continue the conversion.
+ const size_t Used = Capacity - OutputLength;
+ Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
+ ? 2 + Capacity
+ : std::numeric_limits<size_t>::max();
+ Result.resize_for_overwrite(Capacity);
+ // The storage may have moved, so recompute the write position from the
+ // number of bytes already written.
+ Output = static_cast<char *>(Result.data()) + Used;
+ OutputLength = Capacity - Used;
+ return std::error_code();
+ } else {
+ // Some other error occurred.
+ return std::error_code(errno, std::generic_category());
+ }
+ } else {
+ // A positive return value indicates that some characters were converted
+ // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+ // an error in this case makes sure that both conversion routines behave
+ // in the same way.
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ }
+ };
+
+ // Retry until iconv() returns 0; HandleError either grows the buffer (and
+ // returns success so the loop continues) or yields the error to return.
+ size_t Ret;
+ while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+ if (auto EC = HandleError(Ret))
+ return EC;
+
+ // Re-adjust size to actual size.
+ Result.resize(Capacity - OutputLength);
+ return std::error_code();
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+// Builds a table-driven converter for the two encodings known by identifier.
+CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
+                                          text_encoding::id CPTo) {
+  assert(CPFrom != CPTo && "Text encodings should be distinct");
+
+  // Anything that is not UTF-8 -> IBM-1047 is treated as the opposite
+  // direction.
+  const bool ToEBCDIC =
+      CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047;
+  std::unique_ptr<details::CharSetConverterImplBase> Impl =
+      std::make_unique<CharSetConverterTable>(ToEBCDIC ? UTFToIBM1047
+                                                       : IBM1047ToUTF);
+  return CharSetConverter(std::move(Impl));
+}
+
+// Creates a converter from encoding names. Two distinct encodings that are
+// known by identifier use the built-in tables; everything else is handed to
+// ICU or iconv, whichever the build was configured with. Returns
+// std::errc::invalid_argument if no backend supports the conversion.
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {
+  std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
+  std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+  // The table converter requires distinct encodings. Two spellings of the
+  // same encoding (e.g. "UTF-8" and "utf8") must not reach it.
+  if (From && To && *From != *To)
+    return create(*From, *To);
+#ifdef HAVE_ICU
+  // ICU reports failures through UErrorCode only; errno is not meaningful.
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverter *FromConvDesc = ucnv_open(CSFrom.str().c_str(), &EC);
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+  UConverter *ToConvDesc = ucnv_open(CSTo.str().c_str(), &EC);
+  if (U_FAILURE(EC)) {
+    ucnv_close(FromConvDesc);
+    return std::make_error_code(std::errc::invalid_argument);
+  }
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterICU>(FromConvDesc, ToConvDesc);
+  // CharSetConverterICU works on clones of the converters, so the ones opened
+  // here are no longer needed.
+  ucnv_close(FromConvDesc);
+  ucnv_close(ToConvDesc);
+  return CharSetConverter(std::move(Converter));
+#elif defined(HAVE_ICONV)
+  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterIconv>(ConvDesc);
+  return CharSetConverter(std::move(Converter));
+#else
+  return std::make_error_code(std::errc::invalid_argument);
+#endif
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 6de81658264420..2fc70604109a19 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_unittest(SupportTests
BalancedPartitioningTest.cpp
BranchProbabilityTest.cpp
CachePruningTest.cpp
+ CharSetTest.cpp
CrashRecoveryTest.cpp
Casting.cpp
CheckedArithmeticTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
new file mode 100644
index 00000000000000..2f2d8f97102b98
--- /dev/null
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,281 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!"
+static const char HelloA[] =
+ "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+ "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+ "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+ "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+ "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+ "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+ "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+ "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+ "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+ "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+ "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+ "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+// The same string encoded in ISO-2022-JP and in IBM-939, including the
+// trailing sequence that shifts back to the initial state.
+static const char EarthISO2022[] =
+ "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+ "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, FromUTF8) {
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::UTF8,
+                                                   text_encoding::id::IBM1047);
+  SmallString<64> Dst;
+
+  // Inputs that are representable in IBM-1047, paired with the expected
+  // output: the Hello string, the ABC string and the accent string.
+  const struct {
+    const char *Input;
+    const char *Expected;
+  } Cases[] = {{HelloA, HelloE}, {ABCStrA, ABCStrE}, {AccentUTF, AccentE}};
+
+  for (const auto &Case : Cases) {
+    Dst.clear();
+    std::error_code EC = Conv.convert(StringRef(Case.Input), Dst, true);
+    EXPECT_FALSE(EC);
+    EXPECT_STREQ(Case.Expected, Dst.str().str().c_str());
+  }
+
+  // Cyrillic string. Results in error because not representable in 1047.
+  Dst.clear();
+  std::error_code EC = Conv.convert(StringRef(CyrillicUTF), Dst, true);
+  EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+TEST(CharSet, ToUTF8) {
+  CharSetConverter Conv = CharSetConverter::create(text_encoding::id::IBM1047,
+                                                   text_encoding::id::UTF8);
+  SmallString<64> Dst;
+
+  // IBM-1047 inputs paired with the expected UTF-8 output: the Hello string,
+  // the ABC string and the accent string.
+  const struct {
+    const char *Input;
+    const char *Expected;
+  } Cases[] = {{HelloE, HelloA}, {ABCStrE, ABCStrA}, {AccentE, AccentUTF}};
+
+  for (const auto &Case : Cases) {
+    Dst.clear();
+    std::error_code EC = Conv.convert(StringRef(Case.Input), Dst, true);
+    EXPECT_FALSE(EC);
+    EXPECT_STREQ(Case.Expected, Dst.str().str().c_str());
+  }
+}
+
+TEST(CharSet, RoundTrip) {
+  // The three stages of the round trip. Each one needs support from an
+  // underlying conversion library; if a converter cannot be created, check
+  // the reported error and stop the test.
+  ErrorOr<CharSetConverter> EBCDICToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  if (!EBCDICToUTF16) {
+    ASSERT_EQ(EBCDICToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> UTF16ToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  if (!UTF16ToUTF32) {
+    ASSERT_EQ(UTF16ToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> UTF32ToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  if (!UTF32ToEBCDIC) {
+    ASSERT_EQ(UTF32ToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Source bytes 1, 2, ..., 255 followed by a terminating 0.
+  char SrcStr[256];
+  unsigned Next = 1;
+  for (char &Byte : SrcStr)
+    Byte = static_cast<char>(Next++ % 256);
+
+  SmallString<99> AsUTF16, AsUTF32, BackToEBCDIC;
+
+  std::error_code EC = EBCDICToUTF16->convert(StringRef(SrcStr), AsUTF16, true);
+  EXPECT_FALSE(EC);
+  EC = UTF16ToUTF32->convert(AsUTF16, AsUTF32, true);
+  EXPECT_FALSE(EC);
+  EC = UTF32ToEBCDIC->convert(AsUTF32, BackToEBCDIC, true);
+  EXPECT_FALSE(EC);
+  EXPECT_STREQ(SrcStr, BackToEBCDIC.str().str().c_str());
+}
+
+TEST(CharSet, ShiftState2022) {
+  ErrorOr<CharSetConverter> Conv =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  // Stop the test if the conversion is not supported; the factory must then
+  // have reported invalid_argument.
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Convert the Earth string and compare with the expected byte sequence.
+  SmallString<64> Converted;
+  std::error_code EC = Conv->convert(StringRef(EarthUTF), Converted, true);
+  EXPECT_FALSE(EC);
+  EXPECT_STREQ(EarthISO2022, Converted.str().str().c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939) {
+  ErrorOr<CharSetConverter> Conv = CharSetConverter::create("UTF-8", "IBM-939");
+  // Stop the test if the conversion is not supported; the factory must then
+  // have reported invalid_argument.
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Convert the Earth string and compare with the expected byte sequence.
+  SmallString<64> Converted;
+  std::error_code EC = Conv->convert(StringRef(EarthUTF), Converted, true);
+  EXPECT_FALSE(EC);
+  EXPECT_STREQ(EarthIBM939, Converted.str().str().c_str());
+}
+
+#if not defined(HAVE_ICU) && defined(HAVE_ICONV)
+
+// Identical to EarthUTF, except the final character (球) has its last byte
+// taken away from it.
+static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
+static const char EarthISO2022ShiftBack[] =
+ "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
+static const char ShiftBackOnly[] = "\x1B\x28\x42";
+
+// String "地球".
+static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthKanjiOnlyISO2022[] =
+ "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
+static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
+
+TEST(CharSet, ShiftState2022Flush) {
+  ErrorOr<CharSetConverter> Conv =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // The first input ends in a malformed multibyte character, so converting it
+  // has to fail.
+  SmallString<64> BrokenOut;
+  std::error_code BrokenEC =
+      Conv->convert(StringRef(EarthUTFBroken), BrokenOut, true);
+  EXPECT_TRUE(BrokenEC);
+
+  // After a flush the converter must handle a well-formed input as usual.
+  std::error_code FlushEC = Conv->flush();
+  EXPECT_FALSE(FlushEC);
+  SmallString<64> KanjiOut;
+  std::error_code KanjiEC =
+      Conv->convert(StringRef(EarthKanjiOnlyUTF), KanjiOut, true);
+  EXPECT_FALSE(KanjiEC);
+  EXPECT_STREQ(EarthKanjiOnlyISO2022, KanjiOut.str().str().c_str());
+}
+
+TEST(CharSet, ShiftStateIBM939Flush) {
+  ErrorOr<CharSetConverter> Conv = CharSetConverter::create("UTF-8", "IBM-939");
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // The first input ends in a malformed multibyte character, so converting it
+  // has to fail.
+  SmallString<64> BrokenOut;
+  std::error_code BrokenEC =
+      Conv->convert(StringRef(EarthUTFBroken), BrokenOut, true);
+  EXPECT_TRUE(BrokenEC);
+
+  // After a flush the converter must handle a well-formed input as usual.
+  std::error_code FlushEC = Conv->flush();
+  EXPECT_FALSE(FlushEC);
+  SmallString<64> KanjiOut;
+  std::error_code KanjiEC =
+      Conv->convert(StringRef(EarthKanjiOnlyUTF), KanjiOut, true);
+  EXPECT_FALSE(KanjiEC);
+  EXPECT_STREQ(EarthKanjiOnlyIBM939, KanjiOut.str().str().c_str());
+}
+
+TEST(CharSet, ShiftState2022Flush1) {
+  ErrorOr<CharSetConverter> Conv =
+      CharSetConverter::create("UTF-8", "ISO-2022-JP");
+  if (!Conv) {
+    ASSERT_EQ(Conv.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Without auto flush the output must end in the shifted state.
+  SmallString<64> Converted;
+  std::error_code ConvertEC =
+      Conv->convert(StringRef(EarthUTF), Converted, false);
+  EXPECT_FALSE(ConvertEC);
+  EXPECT_STREQ(EarthISO2022ShiftBack, Converted.str().str().c_str());
+
+  // The explicit flush then has to produce only the shift-back sequence.
+  SmallString<64> Flushed;
+  std::error_code FlushEC = Conv->flush(Flushed);
+  EXPECT_FALSE(FlushEC);
+  EXPECT_STREQ(ShiftBackOnly, Flushed.str().str().c_str());
+}
+
+#endif
+
+} // namespace
>From 1dbd35bb1893640b092a5fc16104537bf468d291 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 9 Jan 2024 14:47:24 -0500
Subject: [PATCH 02/19] address review comments
---
llvm/include/llvm/Support/CharSet.h | 38 ++++-----
llvm/lib/Support/CharSet.cpp | 116 ++++++++--------------------
2 files changed, 49 insertions(+), 105 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 856b3be65ff7ed..fd077191c235b5 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -1,4 +1,4 @@
-//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//===-- CharSet.h - Characters set conversion class ---------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -35,9 +35,9 @@ class CharSetConverterImplBase {
/// Converts a string.
/// \param[in] Source source string
- /// \param[in,out] Result container for converted string
+ /// \param[out] Result container for converted string
/// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
- /// for multi-byte encodings iff true.
+ /// for stateful encodings if true.
/// \return error code in case something went wrong
///
/// The following error codes can occur, among others:
@@ -59,9 +59,9 @@ class CharSetConverterImplBase {
/// Restore the conversion to the original state.
/// \return error code in case something went wrong
///
- /// If the original character set or the destination character set
- /// are multi-byte character sets, set the shift state to the initial
- /// state. Otherwise this is a no-op.
+ /// If the destination character set is a stateful character set,
+ /// set the shift state to the initial state.
+ /// Otherwise this is a no-op.
virtual std::error_code flush() const = 0;
virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
@@ -80,7 +80,6 @@ enum class id {
} // end namespace text_encoding
/// Utility class to convert between different character set encodings.
-/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
class CharSetConverter {
// details::CharSetConverterImplBase *Converter;
std::unique_ptr<details::CharSetConverterImplBase> Converter;
@@ -121,33 +120,30 @@ class CharSetConverter {
/// Converts a string.
/// \param[in] Source source string
- /// \param[in,out] Result container for converted string
+ /// \param[out] Result container for converted string
/// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
- /// for multi-byte encodings.
+ /// for stateful encodings.
/// \return error code in case something went wrong
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
bool ShouldAutoFlush = true) const {
return Converter->convert(Source, Result, ShouldAutoFlush);
}
+ ErrorOr<std::string> convert(StringRef Source,
+ bool ShouldAutoFlush = true) const {
+ SmallString<1> Result;
+ auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
+ if (!EC)
+ return std::string(Result);
+ return EC;
+ }
+
char convert(char SingleChar) const {
SmallString<1> Result;
Converter->convert(StringRef(&SingleChar, 1), Result, false);
return Result[0];
}
- /// Converts a string.
- /// \param[in] Source source string
- /// \param[in,out] Result container for converted string
- /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
- /// for multi-byte encodings iff true.
- /// \return error code in case something went wrong
- std::error_code convert(const std::string &Source,
- SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush = true) const {
- return convert(StringRef(Source), Result, ShouldAutoFlush);
- }
-
std::error_code flush() const { return Converter->flush(); }
std::error_code flush(SmallVectorImpl<char> &Result) const {
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index dbc2cb7c1839d2..1a49d665fdbda0 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -1,4 +1,4 @@
-//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//===-- CharSet.cpp - Characters sets conversion class ------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -32,7 +32,8 @@ using namespace llvm;
// Normalize the charset name with the charset alias matching algorithm proposed
// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
-void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
+static void normalizeCharSetName(StringRef CSName,
+ SmallVectorImpl<char> &Normalized) {
bool PrevDigit = false;
for (auto Ch : CSName) {
if (isAlnum(Ch)) {
@@ -49,15 +50,26 @@ void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
SmallString<16> Normalized;
normalizeCharSetName(CSName, Normalized);
-#define CSNAME(CS, STR) \
- if (Normalized.equals(STR)) \
- return CS
- CSNAME(text_encoding::id::UTF8, "utf8");
- CSNAME(text_encoding::id::IBM1047, "ibm1047");
-#undef CSNAME
+ if (Normalized.equals("utf8"))
+ return text_encoding::id::UTF8;
+ if (Normalized.equals("ibm1047"))
+ return text_encoding::id::IBM1047;
return std::nullopt;
}
+void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+ SmallVectorImpl<char> &Result) {
+ // No space left in output buffer. Double the size of the underlying
+ // memory in the SmallVectorImpl, adjust pointer and length and continue
+ // the conversion.
+ Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+ ? 2 * Capacity
+ : std::numeric_limits<size_t>::max();
+ Result.resize_for_overwrite(Capacity);
+ Output = static_cast<char *>(Result.data());
+ OutputLength = Capacity;
+}
+
namespace {
enum ConversionType {
UTFToIBM1047,
@@ -138,31 +150,12 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const {
// Setup the output. We directly write into the SmallVector.
+ Result.resize_for_overwrite(Source.size());
size_t OutputLength, Capacity = Result.capacity();
char *Output, *Out;
UErrorCode EC = U_ZERO_ERROR;
- auto HandleError = [&Capacity, &Output, &OutputLength,
- &Result](UErrorCode UEC) {
- if (UEC == U_BUFFER_OVERFLOW_ERROR &&
- Capacity < std::numeric_limits<size_t>::max()) {
- // No space left in output buffer. Double the size of the underlying
- // memory in the SmallVectorImpl, adjust pointer and length and continue
- // the conversion.
- Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
- ? 2 * Capacity
- : std::numeric_limits<size_t>::max();
- Result.resize_for_overwrite(Capacity);
- Output = static_cast<char *>(Result.data());
- OutputLength = Capacity;
- return std::error_code();
- } else {
- // Some other error occured.
- return std::error_code(errno, std::generic_category());
- }
- };
-
do {
EC = U_ZERO_ERROR;
size_t InputLength = Source.size();
@@ -176,10 +169,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
&Input, In + InputLength, /*pivotStart=*/NULL,
/*pivotSource=*/NULL, /*pivotTarget=*/NULL,
- /*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC);
+ /*pivotLimit=*/NULL, /*reset=*/true,
+ /*flush=*/ShouldAutoFlush, &EC);
if (U_FAILURE(EC)) {
- if (auto error = HandleError(EC))
- return error;
+ if (EC == U_BUFFER_OVERFLOW_ERROR &&
+ Capacity < std::numeric_limits<size_t>::max())
+ HandleOverflow(Capacity, Output, OutputLength, Result);
+ else
+ // Some other error occured.
+ return std::error_code(errno, std::generic_category());
} else if (U_SUCCESS(EC))
break;
} while (U_FAILURE(EC));
@@ -215,8 +213,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
size_t InputLength = Source.size();
char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
// Setup the output. We directly write into the SmallVector.
+ Result.resize_for_overwrite(Source.size());
size_t Capacity = Result.capacity();
- Result.resize_for_overwrite(Capacity);
char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
size_t OutputLength = Capacity;
@@ -227,16 +225,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
if (Ret == static_cast<size_t>(-1)) {
// An error occured. Check if we can gracefully handle it.
if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
- // No space left in output buffer. Double the size of the underlying
- // memory in the SmallVectorImpl, adjust pointer and length and continue
- // the conversion.
- const size_t Used = Capacity - OutputLength;
- Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
- ? 2 * Capacity
- : std::numeric_limits<size_t>::max();
- Result.resize_for_overwrite(Capacity);
- Output = static_cast<char *>(Result.data()) + Used;
- OutputLength = Capacity - Used;
+ HandleOverflow(Capacity, Output, OutputLength, Result);
return std::error_code();
} else {
// Some other error occured.
@@ -276,48 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
std::error_code
CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
- char *Output = Result.data();
- size_t OutputLength = Result.capacity();
- size_t Capacity = Result.capacity();
- Result.resize_for_overwrite(Capacity);
-
- // Handle errors returned from iconv().
- auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
- if (Ret == static_cast<size_t>(-1)) {
- // An error occured. Check if we can gracefully handle it.
- if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
- // No space left in output buffer. Increase the size of the underlying
- // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
- // and continue the conversion.
- const size_t Used = Capacity - OutputLength;
- Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
- ? 2 + Capacity
- : std::numeric_limits<size_t>::max();
- Result.resize_for_overwrite(Capacity);
- Output = static_cast<char *>(Result.data()) + Used;
- OutputLength = Capacity - Used;
- return std::error_code();
- } else {
- // Some other error occured.
- return std::error_code(errno, std::generic_category());
- }
- } else {
- // A positive return value indicates that some characters were converted
- // in a nonreversible way, that is, replaced with a SUB symbol. Returning
- // an error in this case makes sure that both conversion routines behave
- // in the same way.
- return std::make_error_code(std::errc::illegal_byte_sequence);
- }
- };
-
- size_t Ret;
- while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
- if (auto EC = HandleError(Ret))
- return EC;
-
- // Re-adjust size to actual size.
- Result.resize(Capacity - OutputLength);
- return std::error_code();
+ return convert(nullptr, Result);
}
#endif // HAVE_ICONV
>From c864288e945da4d31c3387f2efae25d84b8f4d41 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 10 Jan 2024 09:52:43 -0500
Subject: [PATCH 03/19] add LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV option
---
llvm/CMakeLists.txt | 4 ++++
llvm/cmake/config-ix.cmake | 24 ++++++++++++++----------
2 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index f14065ab037990..118817f19ff2fa 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -560,6 +560,10 @@ else()
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
endif()
+set(LLVM_ENABLE_ICU "ON" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "ON" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 0504a5b2d742ba..3d0eecb65e7089 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -273,19 +273,23 @@ if(LLVM_HAS_LOGF128)
endif()
#Check for icu.
-find_package(ICU COMPONENTS uc i18n)
-if(ICU_FOUND)
- set(HAVE_ICU 1)
-else()
- set(HAVE_ICU 0)
+if(LLVM_ENABLE_ICU)
+ find_package(ICU COMPONENTS uc i18n)
+ if(ICU_FOUND)
+ set(HAVE_ICU 1)
+ else()
+ set(HAVE_ICU 0)
+ endif()
endif()
# Check for iconv.
-find_package(Iconv)
-if(Iconv_FOUND)
- set(HAVE_ICONV 1)
-else()
- set(HAVE_ICONV 0)
+if(LLVM_ENABLE_ICONV)
+ find_package(Iconv)
+ if(Iconv_FOUND)
+ set(HAVE_ICONV 1)
+ else()
+ set(HAVE_ICONV 0)
+ endif()
endif()
# function checks
>From 710f7450fded614455ff2d7ed16f40c89119b343 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 31 Jan 2024 14:06:50 -0500
Subject: [PATCH 04/19] remove single char conversion function
---
llvm/include/llvm/Support/CharSet.h | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index fd077191c235b5..e573b3da9d7cc3 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -138,12 +138,6 @@ class CharSetConverter {
return EC;
}
- char convert(char SingleChar) const {
- SmallString<1> Result;
- Converter->convert(StringRef(&SingleChar, 1), Result, false);
- return Result[0];
- }
-
std::error_code flush() const { return Converter->flush(); }
std::error_code flush(SmallVectorImpl<char> &Result) const {
>From df4ee4370cd2831caf6688c94a01b0c2f25cd4c1 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 23 Feb 2024 13:35:34 -0500
Subject: [PATCH 05/19] handle FORCE_ON, look for shared libraries only for ICU
---
llvm/cmake/config-ix.cmake | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 3d0eecb65e7089..3366cdd733d700 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -274,22 +274,31 @@ endif()
#Check for icu.
if(LLVM_ENABLE_ICU)
- find_package(ICU COMPONENTS uc i18n)
- if(ICU_FOUND)
- set(HAVE_ICU 1)
+ set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".so")
+ if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+ find_package(ICU REQUIRED COMPONENTS uc i18n)
+ if (NOT ICU_FOUND)
+ message(FATAL_ERROR "Failed to configure icu, but LLVM_ENABLE_ICU is FORCE_ON")
+ endif()
else()
- set(HAVE_ICU 0)
+ find_package(ICU COMPONENTS uc i18n)
endif()
+ set(HAVE_ICU ${ICU_FOUND})
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
endif()
# Check for iconv.
if(LLVM_ENABLE_ICONV)
- find_package(Iconv)
- if(Iconv_FOUND)
- set(HAVE_ICONV 1)
+ if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+ find_package(Iconv REQUIRED)
+ if (NOT Iconv_FOUND)
+ message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+ endif()
else()
- set(HAVE_ICONV 0)
+ find_package(Iconv)
endif()
+ set(HAVE_ICONV ${Iconv_FOUND})
endif()
# function checks
>From 4bbf9b0d87ffb48fe955f03bf9e4aba68fcfc153 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 4 Apr 2024 13:45:13 -0400
Subject: [PATCH 06/19] only allow builtin iconv support
---
llvm/cmake/config-ix.cmake | 4 +++-
llvm/lib/Support/CMakeLists.txt | 8 --------
llvm/lib/Support/CharSet.cpp | 4 ++--
3 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 3366cdd733d700..eee54d92b78f4e 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -298,7 +298,9 @@ if(LLVM_ENABLE_ICONV)
else()
find_package(Iconv)
endif()
- set(HAVE_ICONV ${Iconv_FOUND})
+ if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+ set(HAVE_ICONV 1)
+ endif()
endif()
# function checks
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 87543eb66f75b9..ce506b9b226da3 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -317,14 +317,6 @@ if(ICU_FOUND)
PRIVATE
${ICU_LIBRARIES}
)
-else()
- # Link iconv library if it is an external library.
- if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
- target_link_libraries(LLVMSupport
- PRIVATE
- ${Iconv_LIBRARIES}
- )
- endif()
endif()
set(llvm_system_libs ${system_libs})
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 1a49d665fdbda0..330f420c452232 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -215,7 +215,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
// Setup the output. We directly write into the SmallVector.
Result.resize_for_overwrite(Source.size());
size_t Capacity = Result.capacity();
- char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+ char *Output = static_cast<char *>(Result.data());
size_t OutputLength = Capacity;
size_t Ret;
@@ -265,7 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
std::error_code
CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
- return convert(nullptr, Result);
+ return convert("", Result, /*ShouldAutoFlush=*/true);
}
#endif // HAVE_ICONV
>From 0501f35f7ff30523cb187147e4aee4f2d7cfbe77 Mon Sep 17 00:00:00 2001
From: Abhina Sree <69635948+abhina-sree at users.noreply.github.com>
Date: Wed, 17 Apr 2024 08:33:03 -0400
Subject: [PATCH 07/19] Update llvm/cmake/config-ix.cmake
Co-authored-by: Eli Friedman <efriedma at quicinc.com>
---
llvm/cmake/config-ix.cmake | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index eee54d92b78f4e..492941bf32021f 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -292,7 +292,7 @@ endif()
if(LLVM_ENABLE_ICONV)
if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
find_package(Iconv REQUIRED)
- if (NOT Iconv_FOUND)
+ if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
endif()
else()
>From 81db614057665a199ab88bc49ffd1b5c6b4851bf Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 19 Apr 2024 11:55:35 -0400
Subject: [PATCH 08/19] address comments
---
llvm/include/llvm/Support/CharSet.h | 2 +-
llvm/lib/Support/CharSet.cpp | 16 ++++++++--------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index e573b3da9d7cc3..b2c50cd423d6da 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -131,7 +131,7 @@ class CharSetConverter {
ErrorOr<std::string> convert(StringRef Source,
bool ShouldAutoFlush = true) const {
- SmallString<1> Result;
+ SmallString<100> Result;
auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
if (!EC)
return std::string(Result);
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 330f420c452232..55eb5f3f1ad6d6 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -47,7 +47,7 @@ static void normalizeCharSetName(StringRef CSName,
}
// Maps the charset name to enum constant if possible.
-std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
SmallString<16> Normalized;
normalizeCharSetName(CSName, Normalized);
if (Normalized.equals("utf8"))
@@ -57,8 +57,9 @@ std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
return std::nullopt;
}
-void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
- SmallVectorImpl<char> &Result) {
+static void HandleOverflow(size_t &Capacity, char *&Output,
+ size_t &OutputLength,
+ SmallVectorImpl<char> &Result) {
// No space left in output buffer. Double the size of the underlying
// memory in the SmallVectorImpl, adjust pointer and length and continue
// the conversion.
@@ -150,9 +151,10 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const {
// Setup the output. We directly write into the SmallVector.
- Result.resize_for_overwrite(Source.size());
- size_t OutputLength, Capacity = Result.capacity();
+ size_t Capacity = Result.capacity();
+ size_t OutputLength = Capacity;
char *Output, *Out;
+ Result.resize_for_overwrite(Capacity);
UErrorCode EC = U_ZERO_ERROR;
@@ -163,9 +165,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
InputLength ? const_cast<char *>(Source.data()) : nullptr;
const char *In = Input;
Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
- OutputLength = Capacity;
Out = Output;
- Result.resize_for_overwrite(Capacity);
ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
&Input, In + InputLength, /*pivotStart=*/NULL,
/*pivotSource=*/NULL, /*pivotTarget=*/NULL,
@@ -177,7 +177,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
HandleOverflow(Capacity, Output, OutputLength, Result);
else
// Some other error occured.
- return std::error_code(errno, std::generic_category());
+ return std::error_code(EILSEQ, std::generic_category());
} else if (U_SUCCESS(EC))
break;
} while (U_FAILURE(EC));
>From 6926d09ed05aae337a3a6139a4b29df162af13a6 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 22 Apr 2024 13:27:47 -0400
Subject: [PATCH 09/19] remove function to get shift back characters, address
comments
---
llvm/include/llvm/Support/CharSet.h | 6 --
llvm/lib/Support/CharSet.cpp | 31 +++-------
llvm/unittests/Support/CharSetTest.cpp | 85 --------------------------
3 files changed, 7 insertions(+), 115 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index b2c50cd423d6da..b3bc138518b1a0 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -63,8 +63,6 @@ class CharSetConverterImplBase {
/// set the shift state to the initial state.
/// Otherwise this is a no-op.
virtual std::error_code flush() const = 0;
-
- virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
};
} // namespace details
@@ -139,10 +137,6 @@ class CharSetConverter {
}
std::error_code flush() const { return Converter->flush(); }
-
- std::error_code flush(SmallVectorImpl<char> &Result) const {
- return Converter->flush(Result);
- }
};
} // namespace llvm
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 55eb5f3f1ad6d6..73cd34b535b106 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -91,7 +91,6 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const override;
std::error_code flush() const override;
- std::error_code flush(SmallVectorImpl<char> &Result) const override;
};
std::error_code CharSetConverterTable::convert(StringRef Source,
@@ -111,11 +110,6 @@ std::error_code CharSetConverterTable::flush() const {
return std::error_code();
}
-std::error_code
-CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
- return std::error_code();
-}
-
#ifdef HAVE_ICU
class CharSetConverterICU : public details::CharSetConverterImplBase {
UConverter *FromConvDesc;
@@ -144,7 +138,6 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const override;
std::error_code flush() const override;
- std::error_code flush(SmallVectorImpl<char> &Result) const override;
};
std::error_code CharSetConverterICU::convert(StringRef Source,
@@ -164,7 +157,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
const char *Input =
InputLength ? const_cast<char *>(Source.data()) : nullptr;
const char *In = Input;
- Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+ Output = static_cast<char *>(Result.data());
Out = Output;
ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
&Input, In + InputLength, /*pivotStart=*/NULL,
@@ -173,14 +166,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
/*flush=*/ShouldAutoFlush, &EC);
if (U_FAILURE(EC)) {
if (EC == U_BUFFER_OVERFLOW_ERROR &&
- Capacity < std::numeric_limits<size_t>::max())
+ Capacity < std::numeric_limits<size_t>::max()) {
HandleOverflow(Capacity, Output, OutputLength, Result);
- else
+ continue;
+ } else
// Some other error occured.
return std::error_code(EILSEQ, std::generic_category());
- } else if (U_SUCCESS(EC))
- break;
- } while (U_FAILURE(EC));
+ }
+ break;
+ } while (true);
Result.resize(Output - Out);
return std::error_code();
@@ -188,11 +182,6 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
-std::error_code
-CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
- return std::error_code();
-}
-
#elif defined(HAVE_ICONV)
class CharSetConverterIconv : public details::CharSetConverterImplBase {
iconv_t ConvDesc;
@@ -203,7 +192,6 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const override;
std::error_code flush() const override;
- std::error_code flush(SmallVectorImpl<char> &Result) const override;
};
std::error_code CharSetConverterIconv::convert(StringRef Source,
@@ -263,11 +251,6 @@ std::error_code CharSetConverterIconv::flush() const {
return std::error_code();
}
-std::error_code
-CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
- return convert("", Result, /*ShouldAutoFlush=*/true);
-}
-
#endif // HAVE_ICONV
} // namespace
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 2f2d8f97102b98..4628a44ef7fff2 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -193,89 +193,4 @@ TEST(CharSet, ShiftStateIBM939) {
EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
}
-#if not defined(HAVE_ICU) && defined(HAVE_ICONV)
-
-// Identical to EarthUTF, except the final character (球) has its last byte
-// taken away from it.
-static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90";
-static const char EarthISO2022ShiftBack[] =
- "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65";
-static const char ShiftBackOnly[] = "\x1B\x28\x42";
-
-// String "地球".
-static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83";
-static const char EarthKanjiOnlyISO2022[] =
- "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42";
-static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f";
-
-TEST(CharSet, ShiftState2022Flush) {
- StringRef Src0(EarthUTFBroken);
- StringRef Src1(EarthKanjiOnlyUTF);
- SmallString<64> Dst0;
- SmallString<64> Dst1;
- ErrorOr<CharSetConverter> ConvTo2022Flush =
- CharSetConverter::create("UTF-8", "ISO-2022-JP");
- if (!ConvTo2022Flush) {
- ASSERT_EQ(ConvTo2022Flush.getError(),
- std::make_error_code(std::errc::invalid_argument));
- return;
- }
-
- // This should emit an error; there is a malformed multibyte character in the
- // input string.
- std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, true);
- EXPECT_TRUE(EC0);
- std::error_code EC1 = ConvTo2022Flush->flush();
- EXPECT_TRUE(!EC1);
- std::error_code EC2 = ConvTo2022Flush->convert(Src1, Dst1, true);
- EXPECT_TRUE(!EC2);
- EXPECT_STREQ(EarthKanjiOnlyISO2022, static_cast<std::string>(Dst1).c_str());
-}
-
-TEST(CharSet, ShiftStateIBM939Flush) {
- StringRef Src0(EarthUTFBroken);
- StringRef Src1(EarthKanjiOnlyUTF);
- SmallString<64> Dst0;
- SmallString<64> Dst1;
- ErrorOr<CharSetConverter> ConvTo939Flush =
- CharSetConverter::create("UTF-8", "IBM-939");
- if (!ConvTo939Flush) {
- ASSERT_EQ(ConvTo939Flush.getError(),
- std::make_error_code(std::errc::invalid_argument));
- return;
- }
-
- // This should emit an error; there is a malformed multibyte character in the
- // input string.
- std::error_code EC0 = ConvTo939Flush->convert(Src0, Dst0, true);
- EXPECT_TRUE(EC0);
- std::error_code EC1 = ConvTo939Flush->flush();
- EXPECT_TRUE(!EC1);
- std::error_code EC2 = ConvTo939Flush->convert(Src1, Dst1, true);
- EXPECT_TRUE(!EC2);
- EXPECT_STREQ(EarthKanjiOnlyIBM939, static_cast<std::string>(Dst1).c_str());
-}
-
-TEST(CharSet, ShiftState2022Flush1) {
- StringRef Src0(EarthUTF);
- SmallString<64> Dst0;
- SmallString<64> Dst1;
- ErrorOr<CharSetConverter> ConvTo2022Flush =
- CharSetConverter::create("UTF-8", "ISO-2022-JP");
- if (!ConvTo2022Flush) {
- ASSERT_EQ(ConvTo2022Flush.getError(),
- std::make_error_code(std::errc::invalid_argument));
- return;
- }
-
- std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, false);
- EXPECT_TRUE(!EC0);
- EXPECT_STREQ(EarthISO2022ShiftBack, static_cast<std::string>(Dst0).c_str());
- std::error_code EC1 = ConvTo2022Flush->flush(Dst1);
- EXPECT_TRUE(!EC1);
- EXPECT_STREQ(ShiftBackOnly, static_cast<std::string>(Dst1).c_str());
-}
-
-#endif
-
} // namespace
>From 6f558d9dee6a87100111b782c761743ebd178ab0 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 23 Apr 2024 08:09:16 -0400
Subject: [PATCH 10/19] remove other flush function as well
---
llvm/include/llvm/Support/CharSet.h | 3 ---
llvm/lib/Support/CharSet.cpp | 17 -----------------
2 files changed, 20 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index b3bc138518b1a0..1500ccae0a24b6 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -62,7 +62,6 @@ class CharSetConverterImplBase {
/// If the destination character set is a stateful character set,
/// set the shift state to the initial state.
/// Otherwise this is a no-op.
- virtual std::error_code flush() const = 0;
};
} // namespace details
@@ -135,8 +134,6 @@ class CharSetConverter {
return std::string(Result);
return EC;
}
-
- std::error_code flush() const { return Converter->flush(); }
};
} // namespace llvm
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 73cd34b535b106..52f00b736af451 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -90,7 +90,6 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const override;
- std::error_code flush() const override;
};
std::error_code CharSetConverterTable::convert(StringRef Source,
@@ -106,10 +105,6 @@ std::error_code CharSetConverterTable::convert(StringRef Source,
return std::error_code();
}
-std::error_code CharSetConverterTable::flush() const {
- return std::error_code();
-}
-
#ifdef HAVE_ICU
class CharSetConverterICU : public details::CharSetConverterImplBase {
UConverter *FromConvDesc;
@@ -137,7 +132,6 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const override;
- std::error_code flush() const override;
};
std::error_code CharSetConverterICU::convert(StringRef Source,
@@ -180,8 +174,6 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
return std::error_code();
}
-std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
-
#elif defined(HAVE_ICONV)
class CharSetConverterIconv : public details::CharSetConverterImplBase {
iconv_t ConvDesc;
@@ -191,7 +183,6 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const override;
- std::error_code flush() const override;
};
std::error_code CharSetConverterIconv::convert(StringRef Source,
@@ -243,14 +234,6 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
return std::error_code();
}
-std::error_code CharSetConverterIconv::flush() const {
- size_t Ret = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
- if (Ret == static_cast<size_t>(-1)) {
- return std::error_code(errno, std::generic_category());
- }
- return std::error_code();
-}
-
#endif // HAVE_ICONV
} // namespace
>From 53be2d6050f1492e2364969760348fdb4869ea71 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 24 Apr 2024 10:43:41 -0400
Subject: [PATCH 11/19] update comments
---
llvm/include/llvm/Support/CharSet.h | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 1500ccae0a24b6..55d75d25102c1d 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -51,17 +51,12 @@ class CharSetConverterImplBase {
/// In case of an error, the result string contains the successfully converted
/// part of the input string.
///
+ /// If the destination charset is a stateful character set, the shift state
+ /// will be set to the initial state.
virtual std::error_code convert(StringRef Source,
SmallVectorImpl<char> &Result,
bool ShouldAutoFlush) const = 0;
-
- /// Restore the conversion to the original state.
- /// \return error code in case something went wrong
- ///
- /// If the destination character set is a stateful character set,
- /// set the shift state to the initial state.
- /// Otherwise this is a no-op.
};
} // namespace details
>From b99dca555e75a370fe0f1b798906b9307e075c9c Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 29 Apr 2024 09:52:08 -0400
Subject: [PATCH 12/19] reset iconv if failed, cause overflow in testcase
---
llvm/lib/Support/CharSet.cpp | 3 +++
llvm/unittests/Support/CharSetTest.cpp | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 52f00b736af451..55e0e2f1692346 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -66,6 +66,7 @@ static void HandleOverflow(size_t &Capacity, char *&Output,
Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
? 2 * Capacity
: std::numeric_limits<size_t>::max();
+ Result.resize(0);
Result.resize_for_overwrite(Capacity);
Output = static_cast<char *>(Result.data());
OutputLength = Capacity;
@@ -205,6 +206,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
// An error occured. Check if we can gracefully handle it.
if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
HandleOverflow(Capacity, Output, OutputLength, Result);
+ // Reset converter
+ iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
return std::error_code();
} else {
// Some other error occured.
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 4628a44ef7fff2..25f3455753908b 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -156,7 +156,7 @@ TEST(CharSet, RoundTrip) {
TEST(CharSet, ShiftState2022) {
// Earth string.
StringRef Src(EarthUTF);
- SmallString<64> Dst;
+ SmallString<8> Dst;
ErrorOr<CharSetConverter> ConvTo2022 =
CharSetConverter::create("UTF-8", "ISO-2022-JP");
>From 290e2bc65fb944d27853c6f3e4e5e0b8149f802e Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 29 Apr 2024 15:03:39 -0400
Subject: [PATCH 13/19] remove AutoFlush, remove stray comment
---
llvm/include/llvm/Support/CharSet.h | 18 ++++-------
llvm/lib/Support/CharSet.cpp | 43 +++++++++++++-------------
llvm/unittests/Support/CharSetTest.cpp | 26 +++++++---------
3 files changed, 40 insertions(+), 47 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 55d75d25102c1d..c1089b744ef764 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -36,8 +36,6 @@ class CharSetConverterImplBase {
/// Converts a string.
/// \param[in] Source source string
/// \param[out] Result container for converted string
- /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
- /// for stateful encodings if true.
/// \return error code in case something went wrong
///
/// The following error codes can occur, among others:
@@ -55,8 +53,7 @@ class CharSetConverterImplBase {
/// will be set to the initial state.
virtual std::error_code convert(StringRef Source,
- SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush) const = 0;
+ SmallVectorImpl<char> &Result) const = 0;
};
} // namespace details
@@ -113,18 +110,15 @@ class CharSetConverter {
/// Converts a string.
/// \param[in] Source source string
/// \param[out] Result container for converted string
- /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
- /// for stateful encodings.
/// \return error code in case something went wrong
- std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush = true) const {
- return Converter->convert(Source, Result, ShouldAutoFlush);
+ std::error_code convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const {
+ return Converter->convert(Source, Result);
}
- ErrorOr<std::string> convert(StringRef Source,
- bool ShouldAutoFlush = true) const {
+ ErrorOr<std::string> convert(StringRef Source) const {
SmallString<100> Result;
- auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
+ auto EC = Converter->convert(Source, Result);
if (!EC)
return std::string(Result);
return EC;
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 55e0e2f1692346..c00a1894e91708 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -57,6 +57,7 @@ static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
return std::nullopt;
}
+#if defined(HAVE_ICONV) || defined(HAVE_ICU)
static void HandleOverflow(size_t &Capacity, char *&Output,
size_t &OutputLength,
SmallVectorImpl<char> &Result) {
@@ -71,6 +72,7 @@ static void HandleOverflow(size_t &Capacity, char *&Output,
Output = static_cast<char *>(Result.data());
OutputLength = Capacity;
}
+#endif
namespace {
enum ConversionType {
@@ -89,13 +91,13 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
public:
CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
- std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush) const override;
+ std::error_code convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const override;
};
-std::error_code CharSetConverterTable::convert(StringRef Source,
- SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush) const {
+std::error_code
+CharSetConverterTable::convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const {
if (ConvType == IBM1047ToUTF) {
ConverterEBCDIC::convertToUTF8(Source, Result);
return std::error_code();
@@ -131,13 +133,13 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
ToConvDesc = nullptr;
}
- std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush) const override;
+ std::error_code convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const override;
};
-std::error_code CharSetConverterICU::convert(StringRef Source,
- SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush) const {
+std::error_code
+CharSetConverterICU::convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const {
// Setup the output. We directly write into the SmallVector.
size_t Capacity = Result.capacity();
size_t OutputLength = Capacity;
@@ -158,7 +160,7 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
&Input, In + InputLength, /*pivotStart=*/NULL,
/*pivotSource=*/NULL, /*pivotTarget=*/NULL,
/*pivotLimit=*/NULL, /*reset=*/true,
- /*flush=*/ShouldAutoFlush, &EC);
+ /*flush=*/true, &EC);
if (U_FAILURE(EC)) {
if (EC == U_BUFFER_OVERFLOW_ERROR &&
Capacity < std::numeric_limits<size_t>::max()) {
@@ -182,13 +184,13 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
public:
CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
- std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush) const override;
+ std::error_code convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const override;
};
-std::error_code CharSetConverterIconv::convert(StringRef Source,
- SmallVectorImpl<char> &Result,
- bool ShouldAutoFlush) const {
+std::error_code
+CharSetConverterIconv::convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const {
// Setup the input. Use nullptr to reset iconv state if input length is zero.
size_t InputLength = Source.size();
char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
@@ -226,11 +228,10 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
if (auto EC = HandleError(Ret))
return EC;
- if (ShouldAutoFlush) {
- while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
- if (auto EC = HandleError(Ret))
- return EC;
- }
+ // Flush the converter
+ while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+ if (auto EC = HandleError(Ret))
+ return EC;
// Re-adjust size to actual size.
Result.resize(Capacity - OutputLength);
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 25f3455753908b..579e21a86e18e5 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -46,8 +46,6 @@ static const char CyrillicUTF[] = "\xd0\xaf";
// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
// back.
static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
-// Identical to above, except the final character (球) has its last byte taken
-// away from it.
static const char EarthISO2022[] =
"\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
static const char EarthIBM939[] =
@@ -60,28 +58,28 @@ TEST(CharSet, FromUTF8) {
CharSetConverter Conv = CharSetConverter::create(text_encoding::id::UTF8,
text_encoding::id::IBM1047);
- std::error_code EC = Conv.convert(Src, Dst, true);
+ std::error_code EC = Conv.convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
Dst.clear();
// ABC string.
Src = ABCStrA;
- EC = Conv.convert(Src, Dst, true);
+ EC = Conv.convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
Dst.clear();
// Accent string.
Src = AccentUTF;
- EC = Conv.convert(Src, Dst, true);
+ EC = Conv.convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
Dst.clear();
// Cyrillic string. Results in error because not representable in 1047.
Src = CyrillicUTF;
- EC = Conv.convert(Src, Dst, true);
+ EC = Conv.convert(Src, Dst);
EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
}
@@ -92,21 +90,21 @@ TEST(CharSet, ToUTF8) {
CharSetConverter Conv = CharSetConverter::create(text_encoding::id::IBM1047,
text_encoding::id::UTF8);
- std::error_code EC = Conv.convert(Src, Dst, true);
+ std::error_code EC = Conv.convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
Dst.clear();
// ABC string.
Src = ABCStrE;
- EC = Conv.convert(Src, Dst, true);
+ EC = Conv.convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
Dst.clear();
// Accent string.
Src = AccentE;
- EC = Conv.convert(Src, Dst, true);
+ EC = Conv.convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
}
@@ -144,11 +142,11 @@ TEST(CharSet, RoundTrip) {
SmallString<99> Dst1Str, Dst2Str, Dst3Str;
- std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str, true);
+ std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
EXPECT_TRUE(!EC);
- EC = ConvToUTF32->convert(Dst1Str, Dst2Str, true);
+ EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
EXPECT_TRUE(!EC);
- EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str, true);
+ EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
EXPECT_TRUE(!EC);
EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
}
@@ -168,7 +166,7 @@ TEST(CharSet, ShiftState2022) {
}
// Check that the string is properly converted.
- std::error_code EC = ConvTo2022->convert(Src, Dst, true);
+ std::error_code EC = ConvTo2022->convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
}
@@ -188,7 +186,7 @@ TEST(CharSet, ShiftStateIBM939) {
}
// Check that the string is properly converted.
- std::error_code EC = ConvToIBM939->convert(Src, Dst, true);
+ std::error_code EC = ConvToIBM939->convert(Src, Dst);
EXPECT_TRUE(!EC);
EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
}
>From f1d4e8ee37437fec4c97d5ba9b31c6298348878b Mon Sep 17 00:00:00 2001
From: Abhina Sree <69635948+abhina-sree at users.noreply.github.com>
Date: Wed, 1 May 2024 09:17:27 -0400
Subject: [PATCH 14/19] formatting nits
Remove comment that looks like code (unique_ptr should be easy enough to understand).
Co-authored-by: Hubert Tong <hubert-reinterpretcast at users.noreply.github.com>
---
llvm/cmake/config-ix.cmake | 4 ++--
llvm/include/llvm/Config/config.h.cmake | 2 +-
llvm/include/llvm/Support/CharSet.h | 16 +++++++---------
llvm/lib/Support/CMakeLists.txt | 2 +-
llvm/lib/Support/CharSet.cpp | 9 ++++-----
5 files changed, 15 insertions(+), 18 deletions(-)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 492941bf32021f..b503a337f488ab 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -272,14 +272,14 @@ if(LLVM_HAS_LOGF128)
set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
endif()
-#Check for icu.
+# Check for ICU.
if(LLVM_ENABLE_ICU)
set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so")
if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
find_package(ICU REQUIRED COMPONENTS uc i18n)
if (NOT ICU_FOUND)
- message(FATAL_ERROR "Failed to configure icu, but LLVM_ENABLE_ICU is FORCE_ON")
+ message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
endif()
else()
find_package(ICU COMPONENTS uc i18n)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 77d352fc50e77e..ca82e1c8a30b12 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -279,7 +279,7 @@
/* Have host's ___chkstk_ms */
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
-/* Define if icu library is available */
+/* Define if ICU library is available */
#cmakedefine HAVE_ICU ${HAVE_ICU}
/* Define if iconv library is available */
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index c1089b744ef764..0d789e3ab637a0 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -46,12 +46,12 @@ class CharSetConverterImplBase {
/// - std::errc::invalid_argument: The input contains an incomplete
/// multibyte sequence.
///
+ /// If the destination charset is a stateful character set, the shift state
+ /// will be set to the initial state.
+ ///
/// In case of an error, the result string contains the successfully converted
/// part of the input string.
///
- /// If the destination charset is a stateful character set, the shift state
- /// will be set to the initial state.
-
virtual std::error_code convert(StringRef Source,
SmallVectorImpl<char> &Result) const = 0;
};
@@ -70,7 +70,6 @@ enum class id {
/// Utility class to convert between different character set encodings.
class CharSetConverter {
- // details::CharSetConverterImplBase *Converter;
std::unique_ptr<details::CharSetConverterImplBase> Converter;
CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
@@ -78,8 +77,8 @@ class CharSetConverter {
public:
/// Creates a CharSetConverter instance.
- /// \param[in] CSFrom name of the source character encoding
- /// \param[in] CSTo name of the target character encoding
+ /// \param[in] CSFrom the source character encoding
+ /// \param[in] CSTo the target character encoding
/// \return a CharSetConverter instance
static CharSetConverter create(text_encoding::id CSFrom,
text_encoding::id CSTo);
@@ -95,9 +94,8 @@ class CharSetConverter {
CharSetConverter(const CharSetConverter &) = delete;
CharSetConverter &operator=(const CharSetConverter &) = delete;
- CharSetConverter(CharSetConverter &&Other) {
- Converter = std::move(Other.Converter);
- }
+ CharSetConverter(CharSetConverter &&Other)
+ : Converter(std::move(Other.Converter)) {}
CharSetConverter &operator=(CharSetConverter &&Other) {
if (this != &Other)
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index ce506b9b226da3..f7284361903766 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -311,7 +311,7 @@ add_llvm_component_library(LLVMSupport
Demangle
)
-# Link icu library if it is an external library.
+# Link ICU library if it is an external library.
if(ICU_FOUND)
target_link_libraries(LLVMSupport
PRIVATE
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index c00a1894e91708..2b984582794f97 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -8,7 +8,7 @@
///
/// \file
/// This file provides utility classes to convert between different character
-/// set encoding.
+/// set encodings.
///
//===----------------------------------------------------------------------===//
@@ -80,7 +80,7 @@ enum ConversionType {
IBM1047ToUTF,
};
-// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// Support conversion between EBCDIC 1047 and UTF-8. This class uses
// built-in translation tables that allow for translation between the
// aforementioned character sets. The use of tables for conversion is only
// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
@@ -166,9 +166,8 @@ CharSetConverterICU::convert(StringRef Source,
Capacity < std::numeric_limits<size_t>::max()) {
HandleOverflow(Capacity, Output, OutputLength, Result);
continue;
- } else
- // Some other error occured.
- return std::error_code(EILSEQ, std::generic_category());
+ // Some other error occured.
+ return std::error_code(EILSEQ, std::generic_category());
}
break;
} while (true);
>From 0139c9e1b1957366f3b394adbdc95db8de843106 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 24 May 2024 11:43:45 -0400
Subject: [PATCH 15/19] Refactor ICU code
---
llvm/lib/Support/CharSet.cpp | 81 ++++++++++++++++++------------------
1 file changed, 40 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 2b984582794f97..0f88d0b9056c74 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -76,8 +76,8 @@ static void HandleOverflow(size_t &Capacity, char *&Output,
namespace {
enum ConversionType {
- UTFToIBM1047,
- IBM1047ToUTF,
+ UTF8ToIBM1047,
+ IBM1047ToUTF8,
};
// Support conversion between EBCDIC 1047 and UTF-8. This class uses
@@ -98,10 +98,10 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
std::error_code
CharSetConverterTable::convert(StringRef Source,
SmallVectorImpl<char> &Result) const {
- if (ConvType == IBM1047ToUTF) {
+ if (ConvType == IBM1047ToUTF8) {
ConverterEBCDIC::convertToUTF8(Source, Result);
return std::error_code();
- } else if (ConvType == UTFToIBM1047) {
+ } else if (ConvType == UTF8ToIBM1047) {
return ConverterEBCDIC::convertToEBCDIC(Source, Result);
}
llvm_unreachable("Invalid ConvType!");
@@ -109,29 +109,23 @@ CharSetConverterTable::convert(StringRef Source,
}
#ifdef HAVE_ICU
+struct UConverterDeleter { // Deleter functor for ICU UConverter handles.
+ void operator()(UConverter *Converter) const {
+ if (Converter) // Only call ucnv_close() on a non-null handle.
+ ucnv_close(Converter);
+ }
+};
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
class CharSetConverterICU : public details::CharSetConverterImplBase {
- UConverter *FromConvDesc;
- UConverter *ToConvDesc;
+ UConverterUniquePtr FromConvDesc;
+ UConverterUniquePtr ToConvDesc;
public:
- CharSetConverterICU(UConverter *Converter) {
- UErrorCode EC = U_ZERO_ERROR;
- FromConvDesc = nullptr;
- ToConvDesc = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
- if (U_FAILURE(EC)) {
- ToConvDesc = nullptr;
- }
- };
-
- CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter) {
- UErrorCode EC = U_ZERO_ERROR;
- FromConvDesc = ucnv_safeClone(FromConverter, nullptr, nullptr, &EC);
- if (U_FAILURE(EC))
- FromConvDesc = nullptr;
- ToConvDesc = ucnv_safeClone(ToConverter, nullptr, nullptr, &EC);
- if (U_FAILURE(EC))
- ToConvDesc = nullptr;
- }
+ CharSetConverterICU(UConverterUniquePtr FromConverter,
+ UConverterUniquePtr ToConverter)
+ : FromConvDesc(std::move(FromConverter)),
+ ToConvDesc(std::move(ToConverter)) {}
std::error_code convert(StringRef Source,
SmallVectorImpl<char> &Result) const override;
@@ -140,24 +134,23 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
std::error_code
CharSetConverterICU::convert(StringRef Source,
SmallVectorImpl<char> &Result) const {
+ // Setup the input in case it has no backing data.
+ size_t InputLength = Source.size();
+ const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
+
// Setup the output. We directly write into the SmallVector.
size_t Capacity = Result.capacity();
size_t OutputLength = Capacity;
- char *Output, *Out;
Result.resize_for_overwrite(Capacity);
-
+ char *Output = static_cast<char *>(Result.data());
UErrorCode EC = U_ZERO_ERROR;
-
do {
EC = U_ZERO_ERROR;
- size_t InputLength = Source.size();
- const char *Input =
- InputLength ? const_cast<char *>(Source.data()) : nullptr;
- const char *In = Input;
- Output = static_cast<char *>(Result.data());
- Out = Output;
- ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
- &Input, In + InputLength, /*pivotStart=*/NULL,
+ const char *Input = In;
+
+ Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+ ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
+ In + InputLength, /*pivotStart=*/NULL,
/*pivotSource=*/NULL, /*pivotTarget=*/NULL,
/*pivotLimit=*/NULL, /*reset=*/true,
/*flush=*/true, &EC);
@@ -166,13 +159,14 @@ CharSetConverterICU::convert(StringRef Source,
Capacity < std::numeric_limits<size_t>::max()) {
HandleOverflow(Capacity, Output, OutputLength, Result);
continue;
+ }
// Some other error occured.
return std::error_code(EILSEQ, std::generic_category());
}
break;
} while (true);
- Result.resize(Output - Out);
+ Result.resize(Output - Result.data());
return std::error_code();
}
@@ -247,9 +241,13 @@ CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
ConversionType Conversion;
if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
- Conversion = UTFToIBM1047;
+ Conversion = UTF8ToIBM1047;
+ else if (CPFrom == text_encoding::id::IBM1047 &&
+ CPTo == text_encoding::id::UTF8)
+ Conversion = IBM1047ToUTF8;
else
- Conversion = IBM1047ToUTF;
+ assert(false &&
+ "Only conversions between UTF-8 and IBM-1047 are supported");
std::unique_ptr<details::CharSetConverterImplBase> Converter =
std::make_unique<CharSetConverterTable>(Conversion);
@@ -264,16 +262,17 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
return create(*From, *To);
#ifdef HAVE_ICU
UErrorCode EC = U_ZERO_ERROR;
- UConverter *FromConvDesc = ucnv_open(CSFrom.str().c_str(), &EC);
+ UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
if (U_FAILURE(EC)) {
return std::error_code(errno, std::generic_category());
}
- UConverter *ToConvDesc = ucnv_open(CSTo.str().c_str(), &EC);
+ UConverterUniquePtr ToConvDesc(ucnv_open(CSTo.str().c_str(), &EC));
if (U_FAILURE(EC)) {
return std::error_code(errno, std::generic_category());
}
std::unique_ptr<details::CharSetConverterImplBase> Converter =
- std::make_unique<CharSetConverterICU>(FromConvDesc, ToConvDesc);
+ std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
+ std::move(ToConvDesc));
return CharSetConverter(std::move(Converter));
#elif defined(HAVE_ICONV)
iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
>From 2a8da8e1481e05e894e895c7c8adaa57a5954317 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 29 May 2024 13:25:13 -0400
Subject: [PATCH 16/19] refactor iconv
---
llvm/lib/Support/CharSet.cpp | 65 +++++++++++++++++++++++++++---------
1 file changed, 49 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 0f88d0b9056c74..8b89eb5c1129d3 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -172,10 +172,34 @@ CharSetConverterICU::convert(StringRef Source,
#elif defined(HAVE_ICONV)
class CharSetConverterIconv : public details::CharSetConverterImplBase {
- iconv_t ConvDesc;
+ class UniqueIconvT {
+   iconv_t ConvDesc; // Owned descriptor; (iconv_t)-1 means "none held".
+
+ public:
+   operator iconv_t() const { return ConvDesc; }
+   UniqueIconvT(iconv_t CD) : ConvDesc(CD) {} // Takes ownership of CD.
+   ~UniqueIconvT() {
+     if (ConvDesc != (iconv_t)-1)
+       iconv_close(ConvDesc);
+   }
+   UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
+     Other.ConvDesc = (iconv_t)-1; // Other no longer owns the descriptor.
+   }
+   UniqueIconvT &operator=(UniqueIconvT &&Other) {
+     if (&Other != this) {
+       if (ConvDesc != (iconv_t)-1)
+         iconv_close(ConvDesc); // Release ours first so it is not leaked.
+       ConvDesc = Other.ConvDesc;
+       Other.ConvDesc = (iconv_t)-1;
+     }
+     return *this;
+   }
+ };
+ UniqueIconvT ConvDesc;
public:
- CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+ CharSetConverterIconv(UniqueIconvT ConvDesc)
+ : ConvDesc(std::move(ConvDesc)) {}
std::error_code convert(StringRef Source,
SmallVectorImpl<char> &Result) const override;
@@ -184,19 +208,16 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
std::error_code
CharSetConverterIconv::convert(StringRef Source,
SmallVectorImpl<char> &Result) const {
- // Setup the input. Use nullptr to reset iconv state if input length is zero.
- size_t InputLength = Source.size();
- char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
// Setup the output. We directly write into the SmallVector.
- Result.resize_for_overwrite(Source.size());
size_t Capacity = Result.capacity();
char *Output = static_cast<char *>(Result.data());
size_t OutputLength = Capacity;
+ Result.resize_for_overwrite(Capacity);
size_t Ret;
-
// Handle errors returned from iconv().
- auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
+ auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
+ this](size_t Ret) {
if (Ret == static_cast<size_t>(-1)) {
// An error occured. Check if we can gracefully handle it.
if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
@@ -217,14 +238,26 @@ CharSetConverterIconv::convert(StringRef Source,
}
};
- // Convert the string.
- while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
- if (auto EC = HandleError(Ret))
- return EC;
- // Flush the converter
- while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
- if (auto EC = HandleError(Ret))
- return EC;
+ do {
+ // Setup the input. Use nullptr to reset iconv state if input length is
+ // zero.
+ size_t InputLength = Source.size();
+ char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+ Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
+ if (Ret != 0) {
+ if (auto EC = HandleError(Ret))
+ return EC;
+ continue;
+ }
+ // Flush the converter
+ Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+ if (Ret != 0) {
+ if (auto EC = HandleError(Ret))
+ return EC;
+ continue;
+ }
+ break;
+ } while (true);
// Re-adjust size to actual size.
Result.resize(Capacity - OutputLength);
>From aad2f4cf052924f295717e78a96333315c93ef35 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 31 May 2024 15:22:14 -0400
Subject: [PATCH 17/19] resize output if error
---
llvm/lib/Support/CharSet.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 8b89eb5c1129d3..ce8cbc217e552b 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -161,6 +161,7 @@ CharSetConverterICU::convert(StringRef Source,
continue;
}
// Some other error occured.
+ Result.resize(Output - Result.data());
return std::error_code(EILSEQ, std::generic_category());
}
break;
@@ -227,6 +228,7 @@ CharSetConverterIconv::convert(StringRef Source,
return std::error_code();
} else {
// Some other error occured.
+ Result.resize(Output - Result.data());
return std::error_code(errno, std::generic_category());
}
} else {
>From 72926e6c97ce7a189be21acc4c1dbde49c519898 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 6 Jan 2025 10:39:05 -0500
Subject: [PATCH 18/19] address some comments
---
llvm/cmake/config-ix.cmake | 2 +-
llvm/include/llvm/Support/CharSet.h | 18 ++++++--
llvm/lib/Support/CharSet.cpp | 47 +++++++++++++-------
llvm/unittests/Support/ConvertEBCDICTest.cpp | 4 +-
4 files changed, 48 insertions(+), 23 deletions(-)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index b503a337f488ab..052fd267505f94 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -288,7 +288,7 @@ if(LLVM_ENABLE_ICU)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
endif()
-# Check for iconv.
+# Check for builtin iconv to avoid licensing issues.
if(LLVM_ENABLE_ICONV)
if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
find_package(Iconv REQUIRED)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 0d789e3ab637a0..a0c9ba36a0f47f 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -33,6 +33,9 @@ class CharSetConverterImplBase {
public:
virtual ~CharSetConverterImplBase() = default;
+ /// Resets the converter to the initial state.
+ virtual void reset() = 0;
+
/// Converts a string.
/// \param[in] Source source string
/// \param[out] Result container for converted string
@@ -52,8 +55,12 @@ class CharSetConverterImplBase {
/// In case of an error, the result string contains the successfully converted
/// part of the input string.
///
- virtual std::error_code convert(StringRef Source,
- SmallVectorImpl<char> &Result) const = 0;
+
+ std::error_code convert(StringRef Source,
+ SmallVectorImpl<char> &Result) const;
+
+ virtual std::error_code convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) = 0;
};
} // namespace details
@@ -111,12 +118,15 @@ class CharSetConverter {
/// \return error code in case something went wrong
std::error_code convert(StringRef Source,
SmallVectorImpl<char> &Result) const {
- return Converter->convert(Source, Result);
+ auto EC = Converter->convertString(Source, Result);
+ Converter->reset();
+ return EC;
}
ErrorOr<std::string> convert(StringRef Source) const {
SmallString<100> Result;
- auto EC = Converter->convert(Source, Result);
+ auto EC = Converter->convertString(Source, Result);
+ Converter->reset();
if (!EC)
return std::string(Result);
return EC;
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index ce8cbc217e552b..1ec91975d8159b 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -91,13 +91,15 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
public:
CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
- std::error_code convert(StringRef Source,
- SmallVectorImpl<char> &Result) const override;
+ std::error_code convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) override;
+
+ void reset() override {}
};
std::error_code
-CharSetConverterTable::convert(StringRef Source,
- SmallVectorImpl<char> &Result) const {
+CharSetConverterTable::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
if (ConvType == IBM1047ToUTF8) {
ConverterEBCDIC::convertToUTF8(Source, Result);
return std::error_code();
@@ -127,13 +129,15 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
: FromConvDesc(std::move(FromConverter)),
ToConvDesc(std::move(ToConverter)) {}
- std::error_code convert(StringRef Source,
- SmallVectorImpl<char> &Result) const override;
+ std::error_code convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) override;
+
+ void reset() override;
};
std::error_code
-CharSetConverterICU::convert(StringRef Source,
- SmallVectorImpl<char> &Result) const {
+CharSetConverterICU::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
// Setup the input in case it has no backing data.
size_t InputLength = Source.size();
const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
@@ -171,6 +175,11 @@ CharSetConverterICU::convert(StringRef Source,
return std::error_code();
}
+void CharSetConverterICU::reset() { // Called after each convert().
+ ucnv_reset(&*FromConvDesc);
+ ucnv_reset(&*ToConvDesc);
+}
+
#elif defined(HAVE_ICONV)
class CharSetConverterIconv : public details::CharSetConverterImplBase {
class UniqueIconvT {
@@ -202,13 +211,15 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
CharSetConverterIconv(UniqueIconvT ConvDesc)
: ConvDesc(std::move(ConvDesc)) {}
- std::error_code convert(StringRef Source,
- SmallVectorImpl<char> &Result) const override;
+ std::error_code convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) override;
+
+ void reset() override;
};
std::error_code
-CharSetConverterIconv::convert(StringRef Source,
- SmallVectorImpl<char> &Result) const {
+CharSetConverterIconv::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
// Setup the output. We directly write into the SmallVector.
size_t Capacity = Result.capacity();
char *Output = static_cast<char *>(Result.data());
@@ -262,10 +273,14 @@ CharSetConverterIconv::convert(StringRef Source,
} while (true);
// Re-adjust size to actual size.
- Result.resize(Capacity - OutputLength);
+ Result.resize(Output - Result.data());
return std::error_code();
}
+void CharSetConverterIconv::reset() { // Null buffers: back to initial state.
+ iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+}
+
#endif // HAVE_ICONV
} // namespace
@@ -281,8 +296,7 @@ CharSetConverter CharSetConverter::create(text_encoding::id CPFrom,
CPTo == text_encoding::id::UTF8)
Conversion = IBM1047ToUTF8;
else
- assert(false &&
- "Only conversions between UTF-8 and IBM-1047 are supported");
+ llvm_unreachable("Invalid ConversionType!");
std::unique_ptr<details::CharSetConverterImplBase> Converter =
std::make_unique<CharSetConverterTable>(Conversion);
@@ -316,6 +330,7 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
std::unique_ptr<details::CharSetConverterImplBase> Converter =
std::make_unique<CharSetConverterIconv>(ConvDesc);
return CharSetConverter(std::move(Converter));
-#endif
+#else
return std::make_error_code(std::errc::invalid_argument);
+#endif
}
diff --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp
index eec76879ac92c5..557f29c391f9cb 100644
--- a/llvm/unittests/Support/ConvertEBCDICTest.cpp
+++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp
@@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
// String with Cyrillic character ya.
static const char CyrillicUTF[] = "\xd0\xaf";
-TEST(CharSet, FromUTF8) {
+TEST(ConverterEBCDIC, convertToEBCDIC) {
// Hello string.
StringRef Src(HelloA);
SmallString<64> Dst;
@@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) {
Dst.clear();
}
-TEST(CharSet, ToUTF8) {
+TEST(ConverterEBCDIC, convertFromEBCDIC) {
// Hello string.
StringRef Src(HelloE);
SmallString<64> Dst;
>From fd68bebafbdb0ce23b779048759b75fbf86e9fdf Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 13 Jan 2025 10:12:01 -0500
Subject: [PATCH 19/19] add callback function to properly report errors
---
llvm/lib/Support/CharSet.cpp | 6 ++++++
llvm/unittests/Support/CharSetTest.cpp | 21 +++++++++++++++++++++
2 files changed, 27 insertions(+)
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 1ec91975d8159b..509cd8209ff06d 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -148,6 +148,12 @@ CharSetConverterICU::convertString(StringRef Source,
Result.resize_for_overwrite(Capacity);
char *Output = static_cast<char *>(Result.data());
UErrorCode EC = U_ZERO_ERROR;
+
+ ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
+ &EC);
+ ucnv_setToUCallBack(&*ToConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
+ &EC);
+
do {
EC = U_ZERO_ERROR;
const char *Input = In;
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 579e21a86e18e5..f68411690bc8ff 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -50,6 +50,8 @@ static const char EarthISO2022[] =
"\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
static const char EarthIBM939[] =
"\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+static const char EarthUTFExtraPartial[] =
+ "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
TEST(CharSet, FromUTF8) {
// Hello string.
@@ -171,6 +173,25 @@ TEST(CharSet, ShiftState2022) {
EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
}
+TEST(CharSet, ShiftState2022Partial) {
+ // Earth string followed by a lone UTF-8 lead byte (truncated sequence).
+ StringRef Src(EarthUTFExtraPartial);
+ SmallString<8> Dst;
+
+ ErrorOr<CharSetConverter> ConvTo2022 =
+ CharSetConverter::create("UTF-8", "ISO-2022-JP");
+ // Stop test if conversion is not supported (no underlying iconv support).
+ if (!ConvTo2022) {
+ ASSERT_EQ(ConvTo2022.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+
+ // The truncated input must be reported as a conversion error.
+ std::error_code EC = ConvTo2022->convert(Src, Dst);
+ EXPECT_TRUE(EC);
+}
+
TEST(CharSet, ShiftStateIBM939) {
// Earth string.
StringRef Src(EarthUTF);
More information about the llvm-commits
mailing list