[llvm] Create a EncodingConverter class with both iconv and icu support. (PR #138893)
Abhina Sree via llvm-commits
llvm-commits at lists.llvm.org
Thu May 15 06:08:55 PDT 2025
https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/138893
>From 3192c7be06dd208a559442d067b2dba63bfe20dc Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 7 May 2025 11:16:28 -0400
Subject: [PATCH 1/5] Create a CharSetConverter class with both iconv and icu
support.
---
llvm/CMakeLists.txt | 4 +
llvm/cmake/config-ix.cmake | 35 ++
llvm/include/llvm/Config/config.h.cmake | 6 +
llvm/include/llvm/Support/CharSet.h | 141 ++++++++
llvm/lib/Support/CMakeLists.txt | 9 +
llvm/lib/Support/CharSet.cpp | 344 +++++++++++++++++++
llvm/unittests/Support/CMakeLists.txt | 1 +
llvm/unittests/Support/CharSetTest.cpp | 232 +++++++++++++
llvm/unittests/Support/ConvertEBCDICTest.cpp | 4 +-
9 files changed, 774 insertions(+), 2 deletions(-)
create mode 100644 llvm/include/llvm/Support/CharSet.h
create mode 100644 llvm/lib/Support/CharSet.cpp
create mode 100644 llvm/unittests/Support/CharSetTest.cpp
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index e8d9ec0d6153a..894c0e1d2e5ae 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -592,6 +592,10 @@ else()
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
endif()
+set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
+set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+
set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 43311dad457ec..f7e826b34d26f 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
endif()
+# ICU and iconv are alternative backends; forcing both at once is rejected.
+if (LLVM_ENABLE_ICU STREQUAL FORCE_ON AND LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+ message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
+endif()
+
+# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing.
+if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
+ # Restrict the library search to the shared-library suffix while looking for
+ # ICU; the previous suffix list is saved here and restored below.
+ set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+ set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
+ if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+ find_package(ICU REQUIRED COMPONENTS uc i18n)
+ if (NOT ICU_FOUND)
+ message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
+ endif()
+ else()
+ find_package(ICU COMPONENTS uc i18n)
+ endif()
+ set(HAVE_ICU ${ICU_FOUND})
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
+endif()
+
+# Check for builtin iconv to avoid licensing issues.
+# iconv is only probed when ICU was not found above.
+if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
+ if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
+ find_package(Iconv REQUIRED)
+ if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
+ message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
+ endif()
+ else()
+ find_package(Iconv)
+ endif()
+ # HAVE_ICONV is only set for an iconv that reports Iconv_IS_BUILT_IN.
+ if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
+ set(HAVE_ICONV 1)
+ endif()
+endif()
+
# function checks
check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 7efac55ab0352..3f70a0150da4f 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -236,6 +236,12 @@
/* Have host's ___chkstk_ms */
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
+/* Define if ICU library is available */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
/* Linker version detected at compile time. */
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 0000000000000..6a28cd19f4143
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,141 @@
+//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+/// Interface shared by all conversion backends. Backends implement the two
+/// private hooks below; users call convert(), which runs the conversion and
+/// then resets the backend.
+class CharSetConverterImplBase {
+  /// Backend hook that performs the conversion.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  ///
+  /// Error codes that can be reported include, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// For a stateful destination charset the shift state is set back to the
+  /// initial state.
+  ///
+  /// When an error is returned, the result string holds the part of the input
+  /// that was converted successfully.
+  virtual std::error_code convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) = 0;
+
+  /// Backend hook that puts the converter back into its initial state.
+  virtual void reset() = 0;
+
+public:
+  virtual ~CharSetConverterImplBase() = default;
+
+  /// Converts \p Source into \p Result and afterwards resets the converter to
+  /// the initial state, whether or not the conversion succeeded.
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
+    const std::error_code Status = convertString(Source, Result);
+    reset();
+    return Status;
+  }
+};
+} // namespace details
+
+// Names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+/// Identifies an encoding for the enum-based CharSetConverter::create()
+/// overload.
+enum class id {
+ /// UTF-8 character set encoding.
+ UTF8,
+
+ /// IBM EBCDIC 1047 character set encoding.
+ IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+///
+/// Instances are move-only and are obtained through one of the create()
+/// factories. A moved-from instance no longer owns a backend and must not be
+/// used for conversion.
+class CharSetConverter {
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  /// Wraps an already constructed backend; only the factories call this.
+  explicit CharSetConverter(
+      std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CSFrom the source character encoding
+  /// \param[in] CSTo the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
+                                          text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  // The only member is a unique_ptr, so the compiler-generated moves do the
+  // right thing and are noexcept.
+  CharSetConverter(CharSetConverter &&Other) noexcept = default;
+  CharSetConverter &operator=(CharSetConverter &&Other) noexcept = default;
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Converter->convert(Source, Result);
+  }
+
+  /// Converts a string and returns the result as a std::string.
+  /// \param[in] Source source string
+  /// \return the converted string, or the error code of the failed conversion
+  ErrorOr<std::string> convert(StringRef Source) const {
+    SmallString<100> Result;
+    if (std::error_code EC = convert(Source, Result))
+      return EC;
+    return std::string(Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index df1e65f3a588c..9a7d26a35bf1a 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -162,6 +162,7 @@ add_llvm_component_library(LLVMSupport
CachePruning.cpp
Caching.cpp
circular_raw_ostream.cpp
+ CharSet.cpp
Chrono.cpp
COM.cpp
CodeGenCoverage.cpp
@@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport
Demangle
)
+# Link ICU library if it is an external library.
+if(ICU_FOUND)
+ target_link_libraries(LLVMSupport
+ PRIVATE
+ ${ICU_LIBRARIES}
+ )
+endif()
+
set(llvm_system_libs ${system_libs})
# This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 0000000000000..6810cf9c6e376
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,344 @@
+//===-- CharSet.cpp - Character set conversion class --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Appends the normalized form of \p CSName to \p Normalized, following the
+// charset alias matching algorithm proposed in
+// https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching:
+// only letters and digits are kept, letters are lower-cased, and a '0' is
+// dropped unless the previously kept character was a digit.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool LastKeptWasDigit = false;
+  for (char C : CSName) {
+    if (!isAlnum(C))
+      continue;
+    const char Lower = toLower(C);
+    // A zero that does not follow a digit is a leading zero; skip it.
+    if (Lower == '0' && !LastKeptWasDigit)
+      continue;
+    LastKeptWasDigit = isDigit(Lower);
+    Normalized.push_back(Lower);
+  }
+}
+
+// Returns the text_encoding::id whose normalized name matches \p CSName, or
+// std::nullopt when the name is not one of the built-in encodings.
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Canonical;
+  normalizeCharSetName(CSName, Canonical);
+  const StringRef Name = Canonical.str();
+  if (Name == "utf8")
+    return text_encoding::id::UTF8;
+  if (Name == "ibm1047")
+    return text_encoding::id::IBM1047;
+  return std::nullopt;
+}
+
+// Grows the output buffer after a conversion ran out of space.
+//
+// \p Capacity is doubled (saturating at the maximum size_t value), \p Result
+// is emptied and resized to the new capacity, and \p Output / \p OutputLength
+// are pointed at the start of the fresh buffer. The previous contents are
+// discarded, so the caller has to restart the conversion from the beginning.
+LLVM_ATTRIBUTE_UNUSED static void
+HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+               SmallVectorImpl<char> &Result) {
+  constexpr size_t MaxCapacity = std::numeric_limits<size_t>::max();
+  if (Capacity == 0) {
+    // Doubling zero makes no progress, and the callers retry until the buffer
+    // is large enough; start from a one-byte buffer instead.
+    Capacity = 1;
+  } else if (Capacity < MaxCapacity / 2) {
+    Capacity = 2 * Capacity;
+  } else {
+    Capacity = MaxCapacity;
+  }
+  // Shrink to zero first so that growing does not copy the stale contents.
+  Result.resize(0);
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
+namespace {
+// Direction of the conversion performed by CharSetConverterTable.
+enum ConversionType {
+ UTF8ToIBM1047,
+ IBM1047ToUTF8,
+};
+
+// Support conversion between EBCDIC 1047 and UTF-8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable final : public details::CharSetConverterImplBase {
+  // Direction of the conversion, fixed at construction.
+  const ConversionType ConvType;
+
+public:
+  // explicit: a ConversionType must never silently turn into a converter.
+  explicit CharSetConverterTable(ConversionType ConvType)
+      : ConvType(ConvType) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  // Nothing to reset: this class keeps no state besides the direction.
+  void reset() override {}
+};
+
+// Forwards to the ConverterEBCDIC routine that matches the configured
+// direction. Only the UTF-8 to EBCDIC direction returns an error code from the
+// underlying routine.
+std::error_code
+CharSetConverterTable::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
+  switch (ConvType) {
+  case UTF8ToIBM1047:
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  case IBM1047ToUTF8:
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+#ifdef HAVE_ICU
+// Deleter that closes an ICU converter with ucnv_close().
+struct UConverterDeleter {
+  void operator()(UConverter *Handle) const {
+    if (!Handle)
+      return;
+    ucnv_close(Handle);
+  }
+};
+// Owning pointer to an ICU converter.
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
+// Conversion backend that owns two ICU converters: one for the source
+// encoding and one for the target encoding.
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  UConverterUniquePtr FromConvDesc;
+  UConverterUniquePtr ToConvDesc;
+
+public:
+  // Takes ownership of both converters.
+  CharSetConverterICU(UConverterUniquePtr FromConverter,
+                      UConverterUniquePtr ToConverter)
+      : FromConvDesc(std::move(FromConverter)),
+        ToConvDesc(std::move(ToConverter)) {}
+
+  // Converts Source into Result using the two converters.
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  // Resets both converters.
+  void reset() override;
+};
+
+// Converts \p Source with ucnv_convertEx(), writing directly into the storage
+// of \p Result. If the buffer is too small it is grown and the conversion is
+// restarted from the beginning of the input. Any other ICU failure is reported
+// as EILSEQ with \p Result trimmed to the bytes written so far.
+std::error_code
+CharSetConverterICU::convertString(StringRef Source,
+                                   SmallVectorImpl<char> &Result) {
+  // Setup the input in case it has no backing data.
+  size_t InputLength = Source.size();
+  const char *In = InputLength ? Source.data() : "";
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+  char *Output = static_cast<char *>(Result.data());
+  UErrorCode EC = U_ZERO_ERROR;
+
+  // Install ICU's STOP callbacks on both converters.
+  ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
+                      &EC);
+  ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
+                        NULL, &EC);
+  assert(U_SUCCESS(EC));
+
+  do {
+    EC = U_ZERO_ERROR;
+    const char *Input = In;
+
+    // Point the target at the buffer unconditionally, also for empty input:
+    // the size computations below subtract Result.data() from Output, which
+    // is only meaningful while Output points into the buffer.
+    Output = static_cast<char *>(Result.data());
+    ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
+                   In + InputLength, /*pivotStart=*/NULL,
+                   /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
+                   /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/true, &EC);
+    if (U_FAILURE(EC)) {
+      if (EC == U_BUFFER_OVERFLOW_ERROR &&
+          Capacity < std::numeric_limits<size_t>::max()) {
+        // Grow the buffer and redo the whole conversion.
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        continue;
+      }
+      // Some other error occurred.
+      Result.resize(Output - Result.data());
+      return std::error_code(EILSEQ, std::generic_category());
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+// Resets both ICU converters with ucnv_reset().
+void CharSetConverterICU::reset() {
+  ucnv_reset(FromConvDesc.get());
+  ucnv_reset(ToConvDesc.get());
+}
+
+#elif defined(HAVE_ICONV)
+// Conversion backend built on an iconv conversion descriptor.
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  // Move-only owner of an iconv descriptor. The value (iconv_t)-1 marks an
+  // object that owns nothing.
+  class UniqueIconvT {
+    iconv_t ConvDesc;
+
+    // Closes the owned descriptor, if any, and marks this object as empty.
+    void closeDescriptor() {
+      if (ConvDesc != (iconv_t)-1) {
+        iconv_close(ConvDesc);
+        ConvDesc = (iconv_t)-1;
+      }
+    }
+
+  public:
+    operator iconv_t() const { return ConvDesc; }
+    // Intentionally implicit: the factory passes a raw iconv_t to the
+    // CharSetConverterIconv constructor.
+    UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
+    ~UniqueIconvT() { closeDescriptor(); }
+    UniqueIconvT(const UniqueIconvT &) = delete;
+    UniqueIconvT &operator=(const UniqueIconvT &) = delete;
+    UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
+      Other.ConvDesc = (iconv_t)-1;
+    }
+    UniqueIconvT &operator=(UniqueIconvT &&Other) {
+      if (&Other != this) {
+        // Release the descriptor held so far; overwriting it would leak it.
+        closeDescriptor();
+        ConvDesc = Other.ConvDesc;
+        Other.ConvDesc = (iconv_t)-1;
+      }
+      return *this;
+    }
+  };
+  UniqueIconvT ConvDesc;
+
+public:
+  // Takes ownership of the descriptor.
+  CharSetConverterIconv(UniqueIconvT ConvDesc)
+      : ConvDesc(std::move(ConvDesc)) {}
+
+  // Converts Source into Result with iconv().
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  // Resets the descriptor to its initial state.
+  void reset() override;
+};
+
+// Converts \p Source with iconv(), writing directly into the storage of
+// \p Result. If the buffer is too small it is grown and the conversion is
+// restarted from the beginning of the input. On every error return \p Result
+// is trimmed to the bytes written so far.
+std::error_code
+CharSetConverterIconv::convertString(StringRef Source,
+                                     SmallVectorImpl<char> &Result) {
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = static_cast<char *>(Result.data());
+  size_t OutputLength = Capacity;
+
+  size_t Ret;
+  // Handle errors returned from iconv(). A default-constructed error_code
+  // means the buffer was enlarged and the conversion has to be retried; any
+  // other value is the error to hand back to the caller.
+  auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
+                      this](size_t Ret) {
+    if (Ret == static_cast<size_t>(-1)) {
+      // Read errno right away so that no later call can overwrite it.
+      const int Err = errno;
+      // An error occurred. Check if we can gracefully handle it.
+      if (Err == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+        // Reset converter
+        iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+        return std::error_code();
+      }
+      // Some other error occurred.
+      Result.resize(Output - Result.data());
+      return std::error_code(Err, std::generic_category());
+    }
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way. Trim the result as on the other error path so that no
+    // uninitialized bytes remain in it.
+    Result.resize(Output - Result.data());
+    return std::make_error_code(std::errc::illegal_byte_sequence);
+  };
+
+  do {
+    // Setup the input. Use nullptr to reset iconv state if input length is
+    // zero.
+    size_t InputLength = Source.size();
+    char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+    Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    // Flush the converter
+    Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+    if (Ret != 0) {
+      if (auto EC = HandleError(Ret))
+        return EC;
+      continue;
+    }
+    break;
+  } while (true);
+
+  // Re-adjust size to actual size.
+  Result.resize(Output - Result.data());
+  return std::error_code();
+}
+
+// Resets the iconv descriptor by calling iconv() with null input and output
+// arguments.
+void CharSetConverterIconv::reset() {
+ iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+}
+
+#endif // HAVE_ICONV
+} // namespace
+
+// Creates a table-based converter for the UTF-8 <-> IBM-1047 pair. Any other
+// combination is reported as std::errc::invalid_argument.
+ErrorOr<CharSetConverter> CharSetConverter::create(text_encoding::id CPFrom,
+                                                   text_encoding::id CPTo) {
+
+  assert(CPFrom != CPTo && "Text encodings should be distinct");
+
+  ConversionType Conversion;
+  if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
+    Conversion = UTF8ToIBM1047;
+  else if (CPFrom == text_encoding::id::IBM1047 &&
+           CPTo == text_encoding::id::UTF8)
+    Conversion = IBM1047ToUTF8;
+  else
+    // Nothing on this path sets errno, so report the documented error code
+    // explicitly instead of whatever errno happens to hold.
+    return std::make_error_code(std::errc::invalid_argument);
+
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterTable>(Conversion);
+  return CharSetConverter(std::move(Converter));
+}
+
+// Creates a converter from encoding names. The built-in table converter is
+// tried first when both names map to known encodings; otherwise ICU or iconv
+// is used, depending on how LLVM was configured. Without either library the
+// request is reported as std::errc::invalid_argument.
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {
+  std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
+  std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+  if (From && To) {
+    ErrorOr<CharSetConverter> Converter = create(*From, *To);
+    if (Converter)
+      return Converter;
+  }
+#ifdef HAVE_ICU
+  // ICU reports failures through UErrorCode, not errno, so map a failed open
+  // to the documented invalid_argument error.
+  UErrorCode EC = U_ZERO_ERROR;
+  UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+  UConverterUniquePtr ToConvDesc(ucnv_open(CSTo.str().c_str(), &EC));
+  if (U_FAILURE(EC))
+    return std::make_error_code(std::errc::invalid_argument);
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
+                                            std::move(ToConvDesc));
+  return CharSetConverter(std::move(Converter));
+#elif defined(HAVE_ICONV)
+  // iconv_open() takes the target encoding first and sets errno on failure.
+  iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+  if (ConvDesc == (iconv_t)-1)
+    return std::error_code(errno, std::generic_category());
+  std::unique_ptr<details::CharSetConverterImplBase> Converter =
+      std::make_unique<CharSetConverterIconv>(ConvDesc);
+  return CharSetConverter(std::move(Converter));
+#else
+  return std::make_error_code(std::errc::invalid_argument);
+#endif
+}
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index b6b9398df5e2e..09e55f116f780 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_unittest(SupportTests
CrashRecoveryTest.cpp
Caching.cpp
Casting.cpp
+ CharSetTest.cpp
CheckedArithmeticTest.cpp
Chrono.cpp
CommandLineTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
new file mode 100644
index 0000000000000..772d46ec73497
--- /dev/null
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,232 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!"
+static const char HelloA[] =
+ "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+ "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+ "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+ "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+ "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+ "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+ "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+ "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+ "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+ "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+ "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+ "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+// String with Cyrillic character ya.
+static const char CyrillicUTF[] = "\xd0\xaf";
+
+// String "Earth地球".
+// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and
+// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII.
+// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts
+// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83";
+static const char EarthISO2022[] =
+ "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42";
+static const char EarthIBM939[] =
+ "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f";
+static const char EarthUTFExtraPartial[] =
+ "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
+
+// Converts the UTF-8 fixtures to IBM-1047 with a converter from the enum-based
+// factory and compares the output with the expected EBCDIC byte strings.
+TEST(CharSet, FromUTF8) {
+ // Hello string.
+ StringRef Src(HelloA);
+ SmallString<64> Dst;
+
+ ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
+ text_encoding::id::UTF8, text_encoding::id::IBM1047);
+
+ // Stop test if conversion is not supported.
+ if (!Conv) {
+ ASSERT_EQ(Conv.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+
+ std::error_code EC = Conv->convert(Src, Dst);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+ Dst.clear();
+
+ // ABC string.
+ Src = ABCStrA;
+ EC = Conv->convert(Src, Dst);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+ Dst.clear();
+
+ // Accent string.
+ Src = AccentUTF;
+ EC = Conv->convert(Src, Dst);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+ Dst.clear();
+
+ // Cyrillic string. Results in error because not representable in 1047.
+ Src = CyrillicUTF;
+ EC = Conv->convert(Src, Dst);
+ EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
+}
+
+// Converts the IBM-1047 fixtures to UTF-8 with a converter from the enum-based
+// factory and compares the output with the expected UTF-8 byte strings.
+TEST(CharSet, ToUTF8) {
+ // Hello string.
+ StringRef Src(HelloE);
+ SmallString<64> Dst;
+
+ ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
+ text_encoding::id::IBM1047, text_encoding::id::UTF8);
+
+ // Stop test if conversion is not supported.
+ if (!Conv) {
+ ASSERT_EQ(Conv.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+
+ std::error_code EC = Conv->convert(Src, Dst);
+
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+ Dst.clear();
+
+ // ABC string.
+ Src = ABCStrE;
+ EC = Conv->convert(Src, Dst);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+ Dst.clear();
+
+ // Accent string.
+ Src = AccentE;
+ EC = Conv->convert(Src, Dst);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+// Sends a string through IBM-1047 -> UTF-16 -> UTF-32 -> IBM-1047 using
+// converters created by name and expects the original bytes back.
+TEST(CharSet, RoundTrip) {
+ ErrorOr<CharSetConverter> ConvToUTF16 =
+ CharSetConverter::create("IBM-1047", "UTF-16");
+ // Stop test if conversion is not supported (no underlying iconv support).
+ if (!ConvToUTF16) {
+ ASSERT_EQ(ConvToUTF16.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+ ErrorOr<CharSetConverter> ConvToUTF32 =
+ CharSetConverter::create("UTF-16", "UTF-32");
+ // Stop test if conversion is not supported (no underlying iconv support).
+ if (!ConvToUTF32) {
+ ASSERT_EQ(ConvToUTF32.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+ ErrorOr<CharSetConverter> ConvToEBCDIC =
+ CharSetConverter::create("UTF-32", "IBM-1047");
+ // Stop test if conversion is not supported (no underlying iconv support).
+ if (!ConvToEBCDIC) {
+ ASSERT_EQ(ConvToEBCDIC.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+
+ // Setup source string.
+ // Elements 0..254 hold the values 1..255; the last element wraps to 0 and
+ // so ends the array with a null byte.
+ char SrcStr[256];
+ for (size_t I = 0; I < 256; ++I)
+ SrcStr[I] = (I + 1) % 256;
+
+ SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+ std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
+ EXPECT_TRUE(!EC);
+ EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
+ EXPECT_TRUE(!EC);
+ EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+// Converts the UTF-8 "Earth" fixture to ISO-2022-JP and expects the byte
+// string EarthISO2022, which contains the escape sequences that switch
+// character sets. The 8-element destination is smaller than the expected
+// output.
+TEST(CharSet, ShiftState2022) {
+ // Earth string.
+ StringRef Src(EarthUTF);
+ SmallString<8> Dst;
+
+ ErrorOr<CharSetConverter> ConvTo2022 =
+ CharSetConverter::create("UTF-8", "ISO-2022-JP");
+ // Stop test if conversion is not supported (no underlying iconv support).
+ if (!ConvTo2022) {
+ ASSERT_EQ(ConvTo2022.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+
+ // Check that the string is properly converted.
+ std::error_code EC = ConvTo2022->convert(Src, Dst);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
+}
+
+// Converts an input that ends in the middle of a multi-byte UTF-8 sequence
+// (EarthUTFExtraPartial has a trailing lone 0xe5 byte) and expects an error.
+TEST(CharSet, ShiftState2022Partial) {
+ // Earth string.
+ StringRef Src(EarthUTFExtraPartial);
+ SmallString<8> Dst;
+
+ ErrorOr<CharSetConverter> ConvTo2022 =
+ CharSetConverter::create("UTF-8", "ISO-2022-JP");
+ // Stop test if conversion is not supported (no underlying iconv support).
+ if (!ConvTo2022) {
+ ASSERT_EQ(ConvTo2022.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+
+ // Check that converting the truncated input reports an error.
+ std::error_code EC = ConvTo2022->convert(Src, Dst);
+ EXPECT_TRUE(EC);
+}
+
+// Converts the UTF-8 "Earth" fixture to IBM-939 and expects the byte string
+// EarthIBM939, which contains the 0x0E/0x0F shift bytes.
+TEST(CharSet, ShiftStateIBM939) {
+ // Earth string.
+ StringRef Src(EarthUTF);
+ SmallString<64> Dst;
+
+ ErrorOr<CharSetConverter> ConvToIBM939 =
+ CharSetConverter::create("UTF-8", "IBM-939");
+ // Stop test if conversion is not supported (no underlying iconv support).
+ if (!ConvToIBM939) {
+ ASSERT_EQ(ConvToIBM939.getError(),
+ std::make_error_code(std::errc::invalid_argument));
+ return;
+ }
+
+ // Check that the string is properly converted.
+ std::error_code EC = ConvToIBM939->convert(Src, Dst);
+ EXPECT_TRUE(!EC);
+ EXPECT_STREQ(EarthIBM939, static_cast<std::string>(Dst).c_str());
+}
+
+} // namespace
diff --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp
index eec76879ac92c..557f29c391f9c 100644
--- a/llvm/unittests/Support/ConvertEBCDICTest.cpp
+++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp
@@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
// String with Cyrillic character ya.
static const char CyrillicUTF[] = "\xd0\xaf";
-TEST(CharSet, FromUTF8) {
+TEST(ConverterEBCDIC, convertToEBCDIC) {
// Hello string.
StringRef Src(HelloA);
SmallString<64> Dst;
@@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) {
Dst.clear();
}
-TEST(CharSet, ToUTF8) {
+TEST(ConverterEBCDIC, convertFromEBCDIC) {
// Hello string.
StringRef Src(HelloE);
SmallString<64> Dst;
>From 6d40922368d2d0acd511f923791604a149f75667 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 9 May 2025 10:06:18 -0400
Subject: [PATCH 2/5] address comments
---
llvm/include/llvm/Config/config.h.cmake | 4 +-
llvm/include/llvm/Support/CharSet.h | 8 ++--
llvm/lib/Support/CharSet.cpp | 59 ++++++++++++++-----------
llvm/unittests/Support/CharSetTest.cpp | 4 +-
4 files changed, 40 insertions(+), 35 deletions(-)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 3f70a0150da4f..06d4756397911 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -237,10 +237,10 @@
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
/* Define if ICU library is available */
-#cmakedefine HAVE_ICU ${HAVE_ICU}
+#cmakedefine01 HAVE_ICU
/* Define if iconv library is available */
-#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+#cmakedefine01 HAVE_ICONV
/* Linker version detected at compile time. */
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 6a28cd19f4143..8bb5baceccc20 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -69,15 +69,13 @@ class CharSetConverterImplBase {
} // namespace details
// Names inspired by https://wg21.link/p1885.
-namespace text_encoding {
-enum class id {
+enum class TextEncoding {
/// UTF-8 character set encoding.
UTF8,
/// IBM EBCDIC 1047 character set encoding.
IBM1047
};
-} // end namespace text_encoding
/// Utility class to convert between different character set encodings.
class CharSetConverter {
@@ -93,8 +91,8 @@ class CharSetConverter {
/// \param[in] CSFrom the source character encoding
/// \param[in] CSTo the target character encoding
/// \return a CharSetConverter instance or an error code
- static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
- text_encoding::id CSTo);
+ static ErrorOr<CharSetConverter> create(TextEncoding CSFrom,
+ TextEncoding CSTo);
/// Creates a CharSetConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 6810cf9c6e376..2c0b1ad67813c 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -22,9 +22,9 @@
#include <limits>
#include <system_error>
-#ifdef HAVE_ICU
+#if HAVE_ICU
#include <unicode/ucnv.h>
-#elif defined(HAVE_ICONV)
+#elif HAVE_ICONV
#include <iconv.h>
#endif
@@ -47,13 +47,13 @@ static void normalizeCharSetName(StringRef CSName,
}
// Maps the charset name to enum constant if possible.
-static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+static std::optional<TextEncoding> getKnownCharSet(StringRef CSName) {
SmallString<16> Normalized;
normalizeCharSetName(CSName, Normalized);
if (Normalized.equals("utf8"))
- return text_encoding::id::UTF8;
+ return TextEncoding::UTF8;
if (Normalized.equals("ibm1047"))
- return text_encoding::id::IBM1047;
+ return TextEncoding::IBM1047;
return std::nullopt;
}
@@ -98,17 +98,18 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
std::error_code
CharSetConverterTable::convertString(StringRef Source,
SmallVectorImpl<char> &Result) {
- if (ConvType == IBM1047ToUTF8) {
+ switch (ConvType) {
+ case IBM1047ToUTF8:
ConverterEBCDIC::convertToUTF8(Source, Result);
return std::error_code();
- } else if (ConvType == UTF8ToIBM1047) {
+ case UTF8ToIBM1047:
return ConverterEBCDIC::convertToEBCDIC(Source, Result);
}
llvm_unreachable("Invalid ConvType!");
return std::error_code();
}
-#ifdef HAVE_ICU
+#if HAVE_ICU
struct UConverterDeleter {
void operator()(UConverter *Converter) const {
if (Converter)
@@ -133,6 +134,10 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
void reset() override;
};
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and redo the conversion for the remaining string.
std::error_code
CharSetConverterICU::convertString(StringRef Source,
SmallVectorImpl<char> &Result) {
@@ -144,7 +149,7 @@ CharSetConverterICU::convertString(StringRef Source,
size_t Capacity = Result.capacity();
size_t OutputLength = Capacity;
Result.resize_for_overwrite(Capacity);
- char *Output = static_cast<char *>(Result.data());
+ char *Output;
UErrorCode EC = U_ZERO_ERROR;
ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
@@ -185,7 +190,7 @@ void CharSetConverterICU::reset() {
ucnv_reset(&*ToConvDesc);
}
-#elif defined(HAVE_ICONV)
+#elif HAVE_ICONV
class CharSetConverterIconv : public details::CharSetConverterImplBase {
class UniqueIconvT {
iconv_t ConvDesc;
@@ -222,6 +227,10 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
void reset() override;
};
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and redo the conversion for the remaining string.
std::error_code
CharSetConverterIconv::convertString(StringRef Source,
SmallVectorImpl<char> &Result) {
@@ -289,35 +298,35 @@ void CharSetConverterIconv::reset() {
#endif // HAVE_ICONV
} // namespace
-ErrorOr<CharSetConverter> CharSetConverter::create(text_encoding::id CPFrom,
- text_encoding::id CPTo) {
+ErrorOr<CharSetConverter> CharSetConverter::create(TextEncoding CPFrom,
+ TextEncoding CPTo) {
- assert(CPFrom != CPTo && "Text encodings should be distinct");
+ // text encodings should be distinct
+ if(CPFrom == CPTo)
+ return std::make_error_code(std::errc::invalid_argument);
ConversionType Conversion;
- if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047)
+ if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
Conversion = UTF8ToIBM1047;
- else if (CPFrom == text_encoding::id::IBM1047 &&
- CPTo == text_encoding::id::UTF8)
+ else if (CPFrom == TextEncoding::IBM1047 &&
+ CPTo == TextEncoding::UTF8)
Conversion = IBM1047ToUTF8;
else
return std::error_code(errno, std::generic_category());
- std::unique_ptr<details::CharSetConverterImplBase> Converter =
- std::make_unique<CharSetConverterTable>(Conversion);
- return CharSetConverter(std::move(Converter));
+ return CharSetConverter(std::make_unique<CharSetConverterTable>(Conversion));
}
ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
StringRef CSTo) {
- std::optional<text_encoding::id> From = getKnownCharSet(CSFrom);
- std::optional<text_encoding::id> To = getKnownCharSet(CSTo);
+ std::optional<TextEncoding> From = getKnownCharSet(CSFrom);
+ std::optional<TextEncoding> To = getKnownCharSet(CSTo);
if (From && To) {
ErrorOr<CharSetConverter> Converter = create(*From, *To);
if (Converter)
return Converter;
}
-#ifdef HAVE_ICU
+#if HAVE_ICU
UErrorCode EC = U_ZERO_ERROR;
UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
if (U_FAILURE(EC)) {
@@ -331,13 +340,11 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
std::move(ToConvDesc));
return CharSetConverter(std::move(Converter));
-#elif defined(HAVE_ICONV)
+#elif HAVE_ICONV
iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
if (ConvDesc == (iconv_t)-1)
return std::error_code(errno, std::generic_category());
- std::unique_ptr<details::CharSetConverterImplBase> Converter =
- std::make_unique<CharSetConverterIconv>(ConvDesc);
- return CharSetConverter(std::move(Converter));
+ return CharSetConverter(std::make_unique<CharSetConverterIconv>(ConvDesc));
#else
return std::make_error_code(std::errc::invalid_argument);
#endif
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index 772d46ec73497..eeaf24acda225 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -59,7 +59,7 @@ TEST(CharSet, FromUTF8) {
SmallString<64> Dst;
ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
- text_encoding::id::UTF8, text_encoding::id::IBM1047);
+ TextEncoding::UTF8, TextEncoding::IBM1047);
// Stop test if conversion is not supported.
if (!Conv) {
@@ -99,7 +99,7 @@ TEST(CharSet, ToUTF8) {
SmallString<64> Dst;
ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
- text_encoding::id::IBM1047, text_encoding::id::UTF8);
+ TextEncoding::IBM1047, TextEncoding::UTF8);
// Stop test if conversion is not supported.
if (!Conv) {
>From 52635f214877a2d46eee5f78d29ee1b5c97150b9 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 9 May 2025 10:38:39 -0400
Subject: [PATCH 3/5] rename CharSetConverter to EncodingConverter
---
llvm/include/llvm/Support/CharSet.h | 35 +++++++-------
llvm/lib/Support/CharSet.cpp | 64 +++++++++++++-------------
llvm/unittests/Support/CharSetTest.cpp | 32 ++++++-------
3 files changed, 66 insertions(+), 65 deletions(-)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
index 8bb5baceccc20..22263a60a1a1f 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -28,7 +28,7 @@ namespace llvm {
template <typename T> class SmallVectorImpl;
namespace details {
-class CharSetConverterImplBase {
+class EncodingConverterImplBase {
private:
/// Converts a string.
@@ -57,7 +57,7 @@ class CharSetConverterImplBase {
virtual void reset() = 0;
public:
- virtual ~CharSetConverterImplBase() = default;
+ virtual ~EncodingConverterImplBase() = default;
/// Converts a string and resets the converter to the initial state.
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
@@ -78,43 +78,44 @@ enum class TextEncoding {
};
/// Utility class to convert between different character set encodings.
-class CharSetConverter {
- std::unique_ptr<details::CharSetConverterImplBase> Converter;
+class EncodingConverter {
+ std::unique_ptr<details::EncodingConverterImplBase> Converter;
- CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
+ EncodingConverter(
+ std::unique_ptr<details::EncodingConverterImplBase> Converter)
: Converter(std::move(Converter)) {}
public:
- /// Creates a CharSetConverter instance.
+ /// Creates an EncodingConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
/// \param[in] CSFrom the source character encoding
/// \param[in] CSTo the target character encoding
- /// \return a CharSetConverter instance or an error code
- static ErrorOr<CharSetConverter> create(TextEncoding CSFrom,
- TextEncoding CSTo);
+ /// \return an EncodingConverter instance or an error code
+ static ErrorOr<EncodingConverter> create(TextEncoding CSFrom,
+ TextEncoding CSTo);
- /// Creates a CharSetConverter instance.
+ /// Creates an EncodingConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
/// \param[in] CPFrom name of the source character encoding
/// \param[in] CPTo name of the target character encoding
- /// \return a CharSetConverter instance or an error code
- static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+ /// \return an EncodingConverter instance or an error code
+ static ErrorOr<EncodingConverter> create(StringRef CPFrom, StringRef CPTo);
- CharSetConverter(const CharSetConverter &) = delete;
- CharSetConverter &operator=(const CharSetConverter &) = delete;
+ EncodingConverter(const EncodingConverter &) = delete;
+ EncodingConverter &operator=(const EncodingConverter &) = delete;
- CharSetConverter(CharSetConverter &&Other)
+ EncodingConverter(EncodingConverter &&Other)
: Converter(std::move(Other.Converter)) {}
- CharSetConverter &operator=(CharSetConverter &&Other) {
+ EncodingConverter &operator=(EncodingConverter &&Other) {
if (this != &Other)
Converter = std::move(Other.Converter);
return *this;
}
- ~CharSetConverter() = default;
+ ~EncodingConverter() = default;
/// Converts a string.
/// \param[in] Source source string
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
index 2c0b1ad67813c..ef5123eef26a0 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/CharSet.cpp
@@ -47,7 +47,7 @@ static void normalizeCharSetName(StringRef CSName,
}
// Maps the charset name to enum constant if possible.
-static std::optional<TextEncoding> getKnownCharSet(StringRef CSName) {
+static std::optional<TextEncoding> getKnownEncoding(StringRef CSName) {
SmallString<16> Normalized;
normalizeCharSetName(CSName, Normalized);
if (Normalized.equals("utf8"))
@@ -83,11 +83,11 @@ enum ConversionType {
// aforementioned character sets. The use of tables for conversion is only
// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
// character sets are not supported.
-class CharSetConverterTable : public details::CharSetConverterImplBase {
+class EncodingConverterTable : public details::EncodingConverterImplBase {
const ConversionType ConvType;
public:
- CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+ EncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
std::error_code convertString(StringRef Source,
SmallVectorImpl<char> &Result) override;
@@ -96,8 +96,8 @@ class CharSetConverterTable : public details::CharSetConverterImplBase {
};
std::error_code
-CharSetConverterTable::convertString(StringRef Source,
- SmallVectorImpl<char> &Result) {
+EncodingConverterTable::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
switch (ConvType) {
case IBM1047ToUTF8:
ConverterEBCDIC::convertToUTF8(Source, Result);
@@ -118,13 +118,13 @@ struct UConverterDeleter {
};
using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
-class CharSetConverterICU : public details::CharSetConverterImplBase {
+class EncodingConverterICU : public details::EncodingConverterImplBase {
UConverterUniquePtr FromConvDesc;
UConverterUniquePtr ToConvDesc;
public:
- CharSetConverterICU(UConverterUniquePtr FromConverter,
- UConverterUniquePtr ToConverter)
+ EncodingConverterICU(UConverterUniquePtr FromConverter,
+ UConverterUniquePtr ToConverter)
: FromConvDesc(std::move(FromConverter)),
ToConvDesc(std::move(ToConverter)) {}
@@ -139,8 +139,8 @@ class CharSetConverterICU : public details::CharSetConverterImplBase {
// insufficient buffer size. In the future, it would be better to save the
// partial result and redo the conversion for the remaining string.
std::error_code
-CharSetConverterICU::convertString(StringRef Source,
- SmallVectorImpl<char> &Result) {
+EncodingConverterICU::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
// Setup the input in case it has no backing data.
size_t InputLength = Source.size();
const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
@@ -185,13 +185,13 @@ CharSetConverterICU::convertString(StringRef Source,
return std::error_code();
}
-void CharSetConverterICU::reset() {
+void EncodingConverterICU::reset() {
ucnv_reset(&*FromConvDesc);
ucnv_reset(&*ToConvDesc);
}
#elif HAVE_ICONV
-class CharSetConverterIconv : public details::CharSetConverterImplBase {
+class EncodingConverterIconv : public details::EncodingConverterImplBase {
class UniqueIconvT {
iconv_t ConvDesc;
@@ -218,7 +218,7 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
UniqueIconvT ConvDesc;
public:
- CharSetConverterIconv(UniqueIconvT ConvDesc)
+ EncodingConverterIconv(UniqueIconvT ConvDesc)
: ConvDesc(std::move(ConvDesc)) {}
std::error_code convertString(StringRef Source,
@@ -232,8 +232,8 @@ class CharSetConverterIconv : public details::CharSetConverterImplBase {
// insufficient buffer size. In the future, it would be better to save the
// partial result and redo the conversion for the remaining string.
std::error_code
-CharSetConverterIconv::convertString(StringRef Source,
- SmallVectorImpl<char> &Result) {
+EncodingConverterIconv::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
// Setup the output. We directly write into the SmallVector.
size_t Capacity = Result.capacity();
char *Output = static_cast<char *>(Result.data());
@@ -291,38 +291,38 @@ CharSetConverterIconv::convertString(StringRef Source,
return std::error_code();
}
-void CharSetConverterIconv::reset() {
+void EncodingConverterIconv::reset() {
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
}
#endif // HAVE_ICONV
} // namespace
-ErrorOr<CharSetConverter> CharSetConverter::create(TextEncoding CPFrom,
- TextEncoding CPTo) {
+ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
+ TextEncoding CPTo) {
// text encodings should be distinct
- if(CPFrom == CPTo)
+ if (CPFrom == CPTo)
return std::make_error_code(std::errc::invalid_argument);
ConversionType Conversion;
if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
Conversion = UTF8ToIBM1047;
- else if (CPFrom == TextEncoding::IBM1047 &&
- CPTo == TextEncoding::UTF8)
+ else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
Conversion = IBM1047ToUTF8;
else
return std::error_code(errno, std::generic_category());
- return CharSetConverter(std::make_unique<CharSetConverterTable>(Conversion));
+ return EncodingConverter(
+ std::make_unique<EncodingConverterTable>(Conversion));
}
-ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
- StringRef CSTo) {
- std::optional<TextEncoding> From = getKnownCharSet(CSFrom);
- std::optional<TextEncoding> To = getKnownCharSet(CSTo);
+ErrorOr<EncodingConverter> EncodingConverter::create(StringRef CSFrom,
+ StringRef CSTo) {
+ std::optional<TextEncoding> From = getKnownEncoding(CSFrom);
+ std::optional<TextEncoding> To = getKnownEncoding(CSTo);
if (From && To) {
- ErrorOr<CharSetConverter> Converter = create(*From, *To);
+ ErrorOr<EncodingConverter> Converter = create(*From, *To);
if (Converter)
return Converter;
}
@@ -336,15 +336,15 @@ ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
if (U_FAILURE(EC)) {
return std::error_code(errno, std::generic_category());
}
- std::unique_ptr<details::CharSetConverterImplBase> Converter =
- std::make_unique<CharSetConverterICU>(std::move(FromConvDesc),
- std::move(ToConvDesc));
- return CharSetConverter(std::move(Converter));
+ std::unique_ptr<details::EncodingConverterImplBase> Converter =
+ std::make_unique<EncodingConverterICU>(std::move(FromConvDesc),
+ std::move(ToConvDesc));
+ return EncodingConverter(std::move(Converter));
#elif HAVE_ICONV
iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
if (ConvDesc == (iconv_t)-1)
return std::error_code(errno, std::generic_category());
- return CharSetConverter(std::make_unique<CharSetConverterIconv>(ConvDesc));
+ return EncodingConverter(std::make_unique<EncodingConverterIconv>(ConvDesc));
#else
return std::make_error_code(std::errc::invalid_argument);
#endif
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp
index eeaf24acda225..77e5b3064e1e9 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/CharSetTest.cpp
@@ -58,8 +58,8 @@ TEST(CharSet, FromUTF8) {
StringRef Src(HelloA);
SmallString<64> Dst;
- ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
- TextEncoding::UTF8, TextEncoding::IBM1047);
+ ErrorOr<EncodingConverter> Conv =
+ EncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
// Stop test if conversion is not supported.
if (!Conv) {
@@ -98,8 +98,8 @@ TEST(CharSet, ToUTF8) {
StringRef Src(HelloE);
SmallString<64> Dst;
- ErrorOr<CharSetConverter> Conv = CharSetConverter::create(
- TextEncoding::IBM1047, TextEncoding::UTF8);
+ ErrorOr<EncodingConverter> Conv =
+ EncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
// Stop test if conversion is not supported.
if (!Conv) {
@@ -129,24 +129,24 @@ TEST(CharSet, ToUTF8) {
}
TEST(CharSet, RoundTrip) {
- ErrorOr<CharSetConverter> ConvToUTF16 =
- CharSetConverter::create("IBM-1047", "UTF-16");
+ ErrorOr<EncodingConverter> ConvToUTF16 =
+ EncodingConverter::create("IBM-1047", "UTF-16");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToUTF16) {
ASSERT_EQ(ConvToUTF16.getError(),
std::make_error_code(std::errc::invalid_argument));
return;
}
- ErrorOr<CharSetConverter> ConvToUTF32 =
- CharSetConverter::create("UTF-16", "UTF-32");
+ ErrorOr<EncodingConverter> ConvToUTF32 =
+ EncodingConverter::create("UTF-16", "UTF-32");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToUTF32) {
ASSERT_EQ(ConvToUTF32.getError(),
std::make_error_code(std::errc::invalid_argument));
return;
}
- ErrorOr<CharSetConverter> ConvToEBCDIC =
- CharSetConverter::create("UTF-32", "IBM-1047");
+ ErrorOr<EncodingConverter> ConvToEBCDIC =
+ EncodingConverter::create("UTF-32", "IBM-1047");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToEBCDIC) {
ASSERT_EQ(ConvToEBCDIC.getError(),
@@ -175,8 +175,8 @@ TEST(CharSet, ShiftState2022) {
StringRef Src(EarthUTF);
SmallString<8> Dst;
- ErrorOr<CharSetConverter> ConvTo2022 =
- CharSetConverter::create("UTF-8", "ISO-2022-JP");
+ ErrorOr<EncodingConverter> ConvTo2022 =
+ EncodingConverter::create("UTF-8", "ISO-2022-JP");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvTo2022) {
ASSERT_EQ(ConvTo2022.getError(),
@@ -195,8 +195,8 @@ TEST(CharSet, ShiftState2022Partial) {
StringRef Src(EarthUTFExtraPartial);
SmallString<8> Dst;
- ErrorOr<CharSetConverter> ConvTo2022 =
- CharSetConverter::create("UTF-8", "ISO-2022-JP");
+ ErrorOr<EncodingConverter> ConvTo2022 =
+ EncodingConverter::create("UTF-8", "ISO-2022-JP");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvTo2022) {
ASSERT_EQ(ConvTo2022.getError(),
@@ -214,8 +214,8 @@ TEST(CharSet, ShiftStateIBM939) {
StringRef Src(EarthUTF);
SmallString<64> Dst;
- ErrorOr<CharSetConverter> ConvToIBM939 =
- CharSetConverter::create("UTF-8", "IBM-939");
+ ErrorOr<EncodingConverter> ConvToIBM939 =
+ EncodingConverter::create("UTF-8", "IBM-939");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToIBM939) {
ASSERT_EQ(ConvToIBM939.getError(),
>From a39b13ee1b47f5d1b539b546402f1f15e3827ab6 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 12 May 2025 09:00:21 -0400
Subject: [PATCH 4/5] address comments, rename CharSet to EncodingConverter
---
.../{CharSet.h => EncodingConverter.h} | 25 +++++-----
llvm/lib/Support/CMakeLists.txt | 2 +-
.../{CharSet.cpp => EncodingConverter.cpp} | 46 +++++++++----------
llvm/unittests/Support/CMakeLists.txt | 2 +-
...rSetTest.cpp => EncodingConverterTest.cpp} | 18 ++++----
5 files changed, 45 insertions(+), 48 deletions(-)
rename llvm/include/llvm/Support/{CharSet.h => EncodingConverter.h} (83%)
rename llvm/lib/Support/{CharSet.cpp => EncodingConverter.cpp} (88%)
rename llvm/unittests/Support/{CharSetTest.cpp => EncodingConverterTest.cpp} (95%)
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/EncodingConverter.h
similarity index 83%
rename from llvm/include/llvm/Support/CharSet.h
rename to llvm/include/llvm/Support/EncodingConverter.h
index 22263a60a1a1f..6ceb7f7f547de 100644
--- a/llvm/include/llvm/Support/CharSet.h
+++ b/llvm/include/llvm/Support/EncodingConverter.h
@@ -1,4 +1,4 @@
-//===-- CharSet.h - Characters set conversion class ---------------*- C++ -*-=//
+//===-- EncodingConverter.h - Encoding conversion class -----------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,8 +12,8 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_CHARSET_H
-#define LLVM_SUPPORT_CHARSET_H
+#ifndef LLVM_SUPPORT_ENCODING_CONVERTER_H
+#define LLVM_SUPPORT_ENCODING_CONVERTER_H
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
@@ -44,8 +44,8 @@ class EncodingConverterImplBase {
/// - std::errc::invalid_argument: The input contains an incomplete
/// multibyte sequence.
///
- /// If the destination charset is a stateful character set, the shift state
- /// will be set to the initial state.
+ /// If the destination encoding is stateful, the shift state will be set
+ /// to the initial state.
///
/// In case of an error, the result string contains the successfully converted
/// part of the input string.
@@ -77,7 +77,7 @@ enum class TextEncoding {
IBM1047
};
-/// Utility class to convert between different character set encodings.
+/// Utility class to convert between different character encodings.
class EncodingConverter {
std::unique_ptr<details::EncodingConverterImplBase> Converter;
@@ -89,19 +89,18 @@ class EncodingConverter {
/// Creates an EncodingConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
- /// \param[in] CSFrom the source character encoding
- /// \param[in] CSTo the target character encoding
+ /// \param[in] From the source character encoding
+ /// \param[in] To the target character encoding
/// \return an EncodingConverter instance or an error code
- static ErrorOr<EncodingConverter> create(TextEncoding CSFrom,
- TextEncoding CSTo);
+ static ErrorOr<EncodingConverter> create(TextEncoding From, TextEncoding To);
/// Creates an EncodingConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
- /// \param[in] CPFrom name of the source character encoding
- /// \param[in] CPTo name of the target character encoding
+ /// \param[in] From name of the source character encoding
+ /// \param[in] To name of the target character encoding
/// \return an EncodingConverter instance or an error code
- static ErrorOr<EncodingConverter> create(StringRef CPFrom, StringRef CPTo);
+ static ErrorOr<EncodingConverter> create(StringRef From, StringRef To);
EncodingConverter(const EncodingConverter &) = delete;
EncodingConverter &operator=(const EncodingConverter &) = delete;
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 9a7d26a35bf1a..64c25148faa01 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -162,7 +162,6 @@ add_llvm_component_library(LLVMSupport
CachePruning.cpp
Caching.cpp
circular_raw_ostream.cpp
- CharSet.cpp
Chrono.cpp
COM.cpp
CodeGenCoverage.cpp
@@ -187,6 +186,7 @@ add_llvm_component_library(LLVMSupport
ELFAttributes.cpp
ELFAttrParserCompact.cpp
ELFAttrParserExtended.cpp
+ EncodingConverter.cpp
Error.cpp
ErrorHandling.cpp
ExponentialBackoff.cpp
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/EncodingConverter.cpp
similarity index 88%
rename from llvm/lib/Support/CharSet.cpp
rename to llvm/lib/Support/EncodingConverter.cpp
index ef5123eef26a0..838fc89b6df95 100644
--- a/llvm/lib/Support/CharSet.cpp
+++ b/llvm/lib/Support/EncodingConverter.cpp
@@ -1,4 +1,4 @@
-//===-- CharSet.cpp - Characters sets conversion class ------------*- C++ -*-=//
+//===-- EncodingConverter.cpp - Encoding conversion class ---------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -8,11 +8,11 @@
///
/// \file
/// This file provides utility classes to convert between different character
-/// set encodings.
+/// encodings.
///
//===----------------------------------------------------------------------===//
-#include "llvm/Support/CharSet.h"
+#include "llvm/Support/EncodingConverter.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -46,10 +46,10 @@ static void normalizeCharSetName(StringRef CSName,
}
}
-// Maps the charset name to enum constant if possible.
-static std::optional<TextEncoding> getKnownEncoding(StringRef CSName) {
+// Maps the encoding name to enum constant if possible.
+static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
SmallString<16> Normalized;
- normalizeCharSetName(CSName, Normalized);
+ normalizeCharSetName(Name, Normalized);
if (Normalized.equals("utf8"))
return TextEncoding::UTF8;
if (Normalized.equals("ibm1047"))
@@ -63,9 +63,8 @@ HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
// No space left in output buffer. Double the size of the underlying
// memory in the SmallVectorImpl, adjust pointer and length and continue
// the conversion.
- Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
- ? 2 * Capacity
- : std::numeric_limits<size_t>::max();
+ Capacity =
+ (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size();
Result.resize(0);
Result.resize_for_overwrite(Capacity);
Output = static_cast<char *>(Result.data());
@@ -80,9 +79,9 @@ enum ConversionType {
// Support conversion between EBCDIC 1047 and UTF-8. This class uses
// built-in translation tables that allow for translation between the
-// aforementioned character sets. The use of tables for conversion is only
+// aforementioned encodings. The use of tables for conversion is only
// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
-// character sets are not supported.
+// encodings are not supported.
class EncodingConverterTable : public details::EncodingConverterImplBase {
const ConversionType ConvType;
@@ -169,8 +168,7 @@ EncodingConverterICU::convertString(StringRef Source,
/*pivotLimit=*/NULL, /*reset=*/true,
/*flush=*/true, &EC);
if (U_FAILURE(EC)) {
- if (EC == U_BUFFER_OVERFLOW_ERROR &&
- Capacity < std::numeric_limits<size_t>::max()) {
+ if (EC == U_BUFFER_OVERFLOW_ERROR && Capacity < Result.max_size()) {
HandleOverflow(Capacity, Output, OutputLength, Result);
continue;
}
@@ -246,7 +244,7 @@ EncodingConverterIconv::convertString(StringRef Source,
this](size_t Ret) {
if (Ret == static_cast<size_t>(-1)) {
// An error occurred. Check if we can gracefully handle it.
- if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
+ if (errno == E2BIG && Capacity < Result.max_size()) {
HandleOverflow(Capacity, Output, OutputLength, Result);
// Reset converter
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
@@ -301,7 +299,7 @@ void EncodingConverterIconv::reset() {
ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
TextEncoding CPTo) {
- // text encodings should be distinct
+ // Text encodings should be distinct.
if (CPFrom == CPTo)
return std::make_error_code(std::errc::invalid_argument);
@@ -317,22 +315,22 @@ ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
std::make_unique<EncodingConverterTable>(Conversion));
}
-ErrorOr<EncodingConverter> EncodingConverter::create(StringRef CSFrom,
- StringRef CSTo) {
- std::optional<TextEncoding> From = getKnownEncoding(CSFrom);
- std::optional<TextEncoding> To = getKnownEncoding(CSTo);
- if (From && To) {
- ErrorOr<EncodingConverter> Converter = create(*From, *To);
+ErrorOr<EncodingConverter> EncodingConverter::create(StringRef From,
+ StringRef To) {
+ std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
+ std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
+ if (FromEncoding && ToEncoding) {
+ ErrorOr<EncodingConverter> Converter = create(*FromEncoding, *ToEncoding);
if (Converter)
return Converter;
}
#if HAVE_ICU
UErrorCode EC = U_ZERO_ERROR;
- UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC));
+ UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
if (U_FAILURE(EC)) {
return std::error_code(errno, std::generic_category());
}
- UConverterUniquePtr ToConvDesc(ucnv_open(CSTo.str().c_str(), &EC));
+ UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
if (U_FAILURE(EC)) {
return std::error_code(errno, std::generic_category());
}
@@ -341,7 +339,7 @@ ErrorOr<EncodingConverter> EncodingConverter::create(StringRef CSFrom,
std::move(ToConvDesc));
return EncodingConverter(std::move(Converter));
#elif HAVE_ICONV
- iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str());
+ iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
if (ConvDesc == (iconv_t)-1)
return std::error_code(errno, std::generic_category());
return EncodingConverter(std::make_unique<EncodingConverterIconv>(ConvDesc));
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 09e55f116f780..083c77a037d0f 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -20,7 +20,6 @@ add_llvm_unittest(SupportTests
CrashRecoveryTest.cpp
Caching.cpp
Casting.cpp
- CharSetTest.cpp
CheckedArithmeticTest.cpp
Chrono.cpp
CommandLineTest.cpp
@@ -40,6 +39,7 @@ add_llvm_unittest(SupportTests
ErrnoTest.cpp
ErrorOrTest.cpp
ErrorTest.cpp
+ EncodingConverterTest.cpp
ExponentialBackoffTest.cpp
ExtensibleRTTITest.cpp
FileCollectorTest.cpp
diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/EncodingConverterTest.cpp
similarity index 95%
rename from llvm/unittests/Support/CharSetTest.cpp
rename to llvm/unittests/Support/EncodingConverterTest.cpp
index 77e5b3064e1e9..9e6853a30d14d 100644
--- a/llvm/unittests/Support/CharSetTest.cpp
+++ b/llvm/unittests/Support/EncodingConverterTest.cpp
@@ -1,4 +1,4 @@
-//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/CharSet.h"
+#include "llvm/Support/EncodingConverter.h"
#include "llvm/ADT/SmallString.h"
#include "gtest/gtest.h"
using namespace llvm;
@@ -53,7 +53,7 @@ static const char EarthIBM939[] =
static const char EarthUTFExtraPartial[] =
"\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5";
-TEST(CharSet, FromUTF8) {
+TEST(Encoding, FromUTF8) {
// Hello string.
StringRef Src(HelloA);
SmallString<64> Dst;
@@ -93,7 +93,7 @@ TEST(CharSet, FromUTF8) {
EXPECT_EQ(EC, std::errc::illegal_byte_sequence);
}
-TEST(CharSet, ToUTF8) {
+TEST(Encoding, ToUTF8) {
// Hello string.
StringRef Src(HelloE);
SmallString<64> Dst;
@@ -128,7 +128,7 @@ TEST(CharSet, ToUTF8) {
EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
}
-TEST(CharSet, RoundTrip) {
+TEST(Encoding, RoundTrip) {
ErrorOr<EncodingConverter> ConvToUTF16 =
EncodingConverter::create("IBM-1047", "UTF-16");
// Stop test if conversion is not supported (no underlying iconv support).
@@ -170,7 +170,7 @@ TEST(CharSet, RoundTrip) {
EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
}
-TEST(CharSet, ShiftState2022) {
+TEST(Encoding, ShiftState2022) {
// Earth string.
StringRef Src(EarthUTF);
SmallString<8> Dst;
@@ -190,7 +190,7 @@ TEST(CharSet, ShiftState2022) {
EXPECT_STREQ(EarthISO2022, static_cast<std::string>(Dst).c_str());
}
-TEST(CharSet, ShiftState2022Partial) {
+TEST(Encoding, InvalidInput) {
// Earth string.
StringRef Src(EarthUTFExtraPartial);
SmallString<8> Dst;
@@ -204,12 +204,12 @@ TEST(CharSet, ShiftState2022Partial) {
return;
}
- // Check that the string is properly converted.
+ // Check that the string failed to convert.
std::error_code EC = ConvTo2022->convert(Src, Dst);
EXPECT_TRUE(EC);
}
-TEST(CharSet, ShiftStateIBM939) {
+TEST(Encoding, ShiftStateIBM939) {
// Earth string.
StringRef Src(EarthUTF);
SmallString<64> Dst;
>From b32b472b6f6c01355bee447a98bcf453ac55e061 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 15 May 2025 09:08:40 -0400
Subject: [PATCH 5/5] rename filename, class to use TextEncoding, address
comments
---
llvm/CMakeLists.txt | 4 +-
llvm/cmake/config-ix.cmake | 2 +-
.../{EncodingConverter.h => TextEncoding.h} | 37 +++++------
llvm/lib/Support/CMakeLists.txt | 2 +-
...EncodingConverter.cpp => TextEncoding.cpp} | 62 ++++++++++---------
llvm/unittests/Support/CMakeLists.txt | 2 +-
...ConverterTest.cpp => TextEncodingTest.cpp} | 34 +++++-----
7 files changed, 74 insertions(+), 69 deletions(-)
rename llvm/include/llvm/Support/{EncodingConverter.h => TextEncoding.h} (76%)
rename llvm/lib/Support/{EncodingConverter.cpp => TextEncoding.cpp} (84%)
rename llvm/unittests/Support/{EncodingConverterTest.cpp => TextEncodingTest.cpp} (88%)
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 894c0e1d2e5ae..09f488fc45513 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -592,9 +592,9 @@ else()
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
endif()
-set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
-set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
+set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index f7e826b34d26f..9d59fea8799b1 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -314,7 +314,7 @@ if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
endif()
-# Check for builtin iconv to avoid licensing issues.
+# Check only for builtin iconv to avoid licensing issues.
if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
find_package(Iconv REQUIRED)
diff --git a/llvm/include/llvm/Support/EncodingConverter.h b/llvm/include/llvm/Support/TextEncoding.h
similarity index 76%
rename from llvm/include/llvm/Support/EncodingConverter.h
rename to llvm/include/llvm/Support/TextEncoding.h
index 6ceb7f7f547de..fd457e5482bbe 100644
--- a/llvm/include/llvm/Support/EncodingConverter.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -1,4 +1,4 @@
-//===-- EncodingConverter.h - Encoding conversion class -----------*- C++ -*-=//
+//===-- TextEncoding.h - Encoding conversion class ----------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -28,7 +28,7 @@ namespace llvm {
template <typename T> class SmallVectorImpl;
namespace details {
-class EncodingConverterImplBase {
+class TextEncodingConverterImplBase {
private:
/// Converts a string.
@@ -57,7 +57,7 @@ class EncodingConverterImplBase {
virtual void reset() = 0;
public:
- virtual ~EncodingConverterImplBase() = default;
+ virtual ~TextEncodingConverterImplBase() = default;
/// Converts a string and resets the converter to the initial state.
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
@@ -78,43 +78,44 @@ enum class TextEncoding {
};
/// Utility class to convert between different character encodings.
-class EncodingConverter {
- std::unique_ptr<details::EncodingConverterImplBase> Converter;
+class TextEncodingConverter {
+ std::unique_ptr<details::TextEncodingConverterImplBase> Converter;
- EncodingConverter(
- std::unique_ptr<details::EncodingConverterImplBase> Converter)
+ TextEncodingConverter(
+ std::unique_ptr<details::TextEncodingConverterImplBase> Converter)
: Converter(std::move(Converter)) {}
public:
- /// Creates a EncodingConverter instance.
+ /// Creates a TextEncodingConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
/// \param[in] From the source character encoding
/// \param[in] To the target character encoding
- /// \return a EncodingConverter instance or an error code
- static ErrorOr<EncodingConverter> create(TextEncoding From, TextEncoding To);
+ /// \return a TextEncodingConverter instance or an error code
+ static ErrorOr<TextEncodingConverter> create(TextEncoding From,
+ TextEncoding To);
- /// Creates a EncodingConverter instance.
+ /// Creates a TextEncodingConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
/// \param[in] From name of the source character encoding
/// \param[in] To name of the target character encoding
- /// \return a EncodingConverter instance or an error code
- static ErrorOr<EncodingConverter> create(StringRef From, StringRef To);
+ /// \return a TextEncodingConverter instance or an error code
+ static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To);
- EncodingConverter(const EncodingConverter &) = delete;
- EncodingConverter &operator=(const EncodingConverter &) = delete;
+ TextEncodingConverter(const TextEncodingConverter &) = delete;
+ TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;
- EncodingConverter(EncodingConverter &&Other)
+ TextEncodingConverter(TextEncodingConverter &&Other)
: Converter(std::move(Other.Converter)) {}
- EncodingConverter &operator=(EncodingConverter &&Other) {
+ TextEncodingConverter &operator=(TextEncodingConverter &&Other) {
if (this != &Other)
Converter = std::move(Other.Converter);
return *this;
}
- ~EncodingConverter() = default;
+ ~TextEncodingConverter() = default;
/// Converts a string.
/// \param[in] Source source string
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 64c25148faa01..09e93f5a2ca7d 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -186,7 +186,6 @@ add_llvm_component_library(LLVMSupport
ELFAttributes.cpp
ELFAttrParserCompact.cpp
ELFAttrParserExtended.cpp
- EncodingConverter.cpp
Error.cpp
ErrorHandling.cpp
ExponentialBackoff.cpp
@@ -258,6 +257,7 @@ add_llvm_component_library(LLVMSupport
SuffixTree.cpp
SystemUtils.cpp
TarWriter.cpp
+ TextEncoding.cpp
ThreadPool.cpp
TimeProfiler.cpp
Timer.cpp
diff --git a/llvm/lib/Support/EncodingConverter.cpp b/llvm/lib/Support/TextEncoding.cpp
similarity index 84%
rename from llvm/lib/Support/EncodingConverter.cpp
rename to llvm/lib/Support/TextEncoding.cpp
index 838fc89b6df95..6f02b6e2e6d43 100644
--- a/llvm/lib/Support/EncodingConverter.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -1,4 +1,4 @@
-//===-- EncodingConverter.cpp - Encoding conversion class ---------*- C++ -*-=//
+//===-- TextEncoding.cpp - Encoding conversion class --------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,7 @@
///
//===----------------------------------------------------------------------===//
-#include "llvm/Support/EncodingConverter.h"
+#include "llvm/Support/TextEncoding.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -82,11 +82,12 @@ enum ConversionType {
// aforementioned encodings. The use of tables for conversion is only
// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
// encodings are not supported.
-class EncodingConverterTable : public details::EncodingConverterImplBase {
+class TextEncodingConverterTable
+ : public details::TextEncodingConverterImplBase {
const ConversionType ConvType;
public:
- EncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+ TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
std::error_code convertString(StringRef Source,
SmallVectorImpl<char> &Result) override;
@@ -95,8 +96,8 @@ class EncodingConverterTable : public details::EncodingConverterImplBase {
};
std::error_code
-EncodingConverterTable::convertString(StringRef Source,
- SmallVectorImpl<char> &Result) {
+TextEncodingConverterTable::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
switch (ConvType) {
case IBM1047ToUTF8:
ConverterEBCDIC::convertToUTF8(Source, Result);
@@ -117,13 +118,13 @@ struct UConverterDeleter {
};
using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
-class EncodingConverterICU : public details::EncodingConverterImplBase {
+class TextEncodingConverterICU : public details::TextEncodingConverterImplBase {
UConverterUniquePtr FromConvDesc;
UConverterUniquePtr ToConvDesc;
public:
- EncodingConverterICU(UConverterUniquePtr FromConverter,
- UConverterUniquePtr ToConverter)
+ TextEncodingConverterICU(UConverterUniquePtr FromConverter,
+ UConverterUniquePtr ToConverter)
: FromConvDesc(std::move(FromConverter)),
ToConvDesc(std::move(ToConverter)) {}
@@ -138,8 +139,8 @@ class EncodingConverterICU : public details::EncodingConverterImplBase {
// insufficient buffer size. In the future, it would be better to save the
// partial result and redo the conversion for the remaining string.
std::error_code
-EncodingConverterICU::convertString(StringRef Source,
- SmallVectorImpl<char> &Result) {
+TextEncodingConverterICU::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
// Setup the input in case it has no backing data.
size_t InputLength = Source.size();
const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
@@ -183,13 +184,14 @@ EncodingConverterICU::convertString(StringRef Source,
return std::error_code();
}
-void EncodingConverterICU::reset() {
+void TextEncodingConverterICU::reset() {
ucnv_reset(&*FromConvDesc);
ucnv_reset(&*ToConvDesc);
}
#elif HAVE_ICONV
-class EncodingConverterIconv : public details::EncodingConverterImplBase {
+class TextEncodingConverterIconv
+ : public details::TextEncodingConverterImplBase {
class UniqueIconvT {
iconv_t ConvDesc;
@@ -216,7 +218,7 @@ class EncodingConverterIconv : public details::EncodingConverterImplBase {
UniqueIconvT ConvDesc;
public:
- EncodingConverterIconv(UniqueIconvT ConvDesc)
+ TextEncodingConverterIconv(UniqueIconvT ConvDesc)
: ConvDesc(std::move(ConvDesc)) {}
std::error_code convertString(StringRef Source,
@@ -230,8 +232,8 @@ class EncodingConverterIconv : public details::EncodingConverterImplBase {
// insufficient buffer size. In the future, it would be better to save the
// partial result and redo the conversion for the remaining string.
std::error_code
-EncodingConverterIconv::convertString(StringRef Source,
- SmallVectorImpl<char> &Result) {
+TextEncodingConverterIconv::convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) {
// Setup the output. We directly write into the SmallVector.
size_t Capacity = Result.capacity();
char *Output = static_cast<char *>(Result.data());
@@ -289,15 +291,15 @@ EncodingConverterIconv::convertString(StringRef Source,
return std::error_code();
}
-void EncodingConverterIconv::reset() {
+void TextEncodingConverterIconv::reset() {
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
}
#endif // HAVE_ICONV
} // namespace
-ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
- TextEncoding CPTo) {
+ErrorOr<TextEncodingConverter>
+TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
// Text encodings should be distinct.
if (CPFrom == CPTo)
@@ -311,16 +313,17 @@ ErrorOr<EncodingConverter> EncodingConverter::create(TextEncoding CPFrom,
else
return std::error_code(errno, std::generic_category());
- return EncodingConverter(
- std::make_unique<EncodingConverterTable>(Conversion));
+ return TextEncodingConverter(
+ std::make_unique<TextEncodingConverterTable>(Conversion));
}
-ErrorOr<EncodingConverter> EncodingConverter::create(StringRef From,
- StringRef To) {
+ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
+ StringRef To) {
std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
if (FromEncoding && ToEncoding) {
- ErrorOr<EncodingConverter> Converter = create(*FromEncoding, *ToEncoding);
+ ErrorOr<TextEncodingConverter> Converter =
+ create(*FromEncoding, *ToEncoding);
if (Converter)
return Converter;
}
@@ -334,15 +337,16 @@ ErrorOr<EncodingConverter> EncodingConverter::create(StringRef From,
if (U_FAILURE(EC)) {
return std::error_code(errno, std::generic_category());
}
- std::unique_ptr<details::EncodingConverterImplBase> Converter =
- std::make_unique<EncodingConverterICU>(std::move(FromConvDesc),
- std::move(ToConvDesc));
- return EncodingConverter(std::move(Converter));
+ std::unique_ptr<details::TextEncodingConverterImplBase> Converter =
+ std::make_unique<TextEncodingConverterICU>(std::move(FromConvDesc),
+ std::move(ToConvDesc));
+ return TextEncodingConverter(std::move(Converter));
#elif HAVE_ICONV
iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
if (ConvDesc == (iconv_t)-1)
return std::error_code(errno, std::generic_category());
- return EncodingConverter(std::make_unique<EncodingConverterIconv>(ConvDesc));
+ return TextEncodingConverter(
+ std::make_unique<TextEncodingConverterIconv>(ConvDesc));
#else
return std::make_error_code(std::errc::invalid_argument);
#endif
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 083c77a037d0f..d048e871fd0fb 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -39,7 +39,6 @@ add_llvm_unittest(SupportTests
ErrnoTest.cpp
ErrorOrTest.cpp
ErrorTest.cpp
- EncodingConverterTest.cpp
ExponentialBackoffTest.cpp
ExtensibleRTTITest.cpp
FileCollectorTest.cpp
@@ -89,6 +88,7 @@ add_llvm_unittest(SupportTests
SuffixTreeTest.cpp
SwapByteOrderTest.cpp
TarWriterTest.cpp
+ TextEncodingTest.cpp
ThreadPool.cpp
ThreadSafeAllocatorTest.cpp
Threading.cpp
diff --git a/llvm/unittests/Support/EncodingConverterTest.cpp b/llvm/unittests/Support/TextEncodingTest.cpp
similarity index 88%
rename from llvm/unittests/Support/EncodingConverterTest.cpp
rename to llvm/unittests/Support/TextEncodingTest.cpp
index 9e6853a30d14d..383dff12c64e4 100644
--- a/llvm/unittests/Support/EncodingConverterTest.cpp
+++ b/llvm/unittests/Support/TextEncodingTest.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/EncodingConverter.h"
+#include "llvm/Support/TextEncoding.h"
#include "llvm/ADT/SmallString.h"
#include "gtest/gtest.h"
using namespace llvm;
@@ -58,8 +58,8 @@ TEST(Encoding, FromUTF8) {
StringRef Src(HelloA);
SmallString<64> Dst;
- ErrorOr<EncodingConverter> Conv =
- EncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
+ ErrorOr<TextEncodingConverter> Conv =
+ TextEncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
// Stop test if conversion is not supported.
if (!Conv) {
@@ -98,8 +98,8 @@ TEST(Encoding, ToUTF8) {
StringRef Src(HelloE);
SmallString<64> Dst;
- ErrorOr<EncodingConverter> Conv =
- EncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
+ ErrorOr<TextEncodingConverter> Conv =
+ TextEncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
// Stop test if conversion is not supported.
if (!Conv) {
@@ -129,24 +129,24 @@ TEST(Encoding, ToUTF8) {
}
TEST(Encoding, RoundTrip) {
- ErrorOr<EncodingConverter> ConvToUTF16 =
- EncodingConverter::create("IBM-1047", "UTF-16");
+ ErrorOr<TextEncodingConverter> ConvToUTF16 =
+ TextEncodingConverter::create("IBM-1047", "UTF-16");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToUTF16) {
ASSERT_EQ(ConvToUTF16.getError(),
std::make_error_code(std::errc::invalid_argument));
return;
}
- ErrorOr<EncodingConverter> ConvToUTF32 =
- EncodingConverter::create("UTF-16", "UTF-32");
+ ErrorOr<TextEncodingConverter> ConvToUTF32 =
+ TextEncodingConverter::create("UTF-16", "UTF-32");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToUTF32) {
ASSERT_EQ(ConvToUTF32.getError(),
std::make_error_code(std::errc::invalid_argument));
return;
}
- ErrorOr<EncodingConverter> ConvToEBCDIC =
- EncodingConverter::create("UTF-32", "IBM-1047");
+ ErrorOr<TextEncodingConverter> ConvToEBCDIC =
+ TextEncodingConverter::create("UTF-32", "IBM-1047");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToEBCDIC) {
ASSERT_EQ(ConvToEBCDIC.getError(),
@@ -175,8 +175,8 @@ TEST(Encoding, ShiftState2022) {
StringRef Src(EarthUTF);
SmallString<8> Dst;
- ErrorOr<EncodingConverter> ConvTo2022 =
- EncodingConverter::create("UTF-8", "ISO-2022-JP");
+ ErrorOr<TextEncodingConverter> ConvTo2022 =
+ TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvTo2022) {
ASSERT_EQ(ConvTo2022.getError(),
@@ -195,8 +195,8 @@ TEST(Encoding, InvalidInput) {
StringRef Src(EarthUTFExtraPartial);
SmallString<8> Dst;
- ErrorOr<EncodingConverter> ConvTo2022 =
- EncodingConverter::create("UTF-8", "ISO-2022-JP");
+ ErrorOr<TextEncodingConverter> ConvTo2022 =
+ TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvTo2022) {
ASSERT_EQ(ConvTo2022.getError(),
@@ -214,8 +214,8 @@ TEST(Encoding, ShiftStateIBM939) {
StringRef Src(EarthUTF);
SmallString<64> Dst;
- ErrorOr<EncodingConverter> ConvToIBM939 =
- EncodingConverter::create("UTF-8", "IBM-939");
+ ErrorOr<TextEncodingConverter> ConvToIBM939 =
+ TextEncodingConverter::create("UTF-8", "IBM-939");
// Stop test if conversion is not supported (no underlying iconv support).
if (!ConvToIBM939) {
ASSERT_EQ(ConvToIBM939.getError(),
More information about the llvm-commits
mailing list