[llvm] Create a EncodingConverter class with both iconv and icu support. (PR #138893)

Mon May 19 20:11:16 PDT 2025

================
@@ -0,0 +1,353 @@
+//===-- TextEncoding.cpp - Encoding conversion class --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TextEncoding.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#if HAVE_ICU
+#include <unicode/ucnv.h>
+#elif HAVE_ICONV
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Canonicalize a charset name using the alias matching rules described in
+// https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+// Non-alphanumeric characters are dropped, letters are lower-cased, and a '0'
+// is dropped unless the previously kept character was a digit. The result is
+// appended to Normalized.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool LastKeptWasDigit = false;
+  for (char C : CSName) {
+    if (!isAlnum(C))
+      continue;
+    const char Lower = toLower(C);
+    // A zero that does not follow a kept digit is skipped.
+    if (Lower == '0' && !LastKeptWasDigit)
+      continue;
+    LastKeptWasDigit = isDigit(Lower);
+    Normalized.push_back(Lower);
+  }
+}
+
+// Translate an encoding name into the matching TextEncoding enumerator, or
+// std::nullopt when the normalized name is not one of the known encodings.
+static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
+  SmallString<16> Canonical;
+  normalizeCharSetName(Name, Canonical);
+  const StringRef Key = Canonical.str();
+  if (Key == "utf8")
+    return TextEncoding::UTF8;
+  if (Key == "ibm1047")
+    return TextEncoding::IBM1047;
+  return std::nullopt;
+}
+
+// Grow the output buffer after a conversion ran out of space. The capacity is
+// doubled (clamped to Result.max_size()), the previous contents of Result are
+// discarded, and Output/OutputLength are pointed at the fresh storage so the
+// caller can continue the conversion.
+LLVM_ATTRIBUTE_UNUSED static void
+HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+               SmallVectorImpl<char> &Result) {
+  const size_t MaxCapacity = Result.max_size();
+  if (Capacity < MaxCapacity / 2)
+    Capacity *= 2;
+  else
+    Capacity = MaxCapacity;
+  // Discard whatever was written so far before resizing.
+  Result.clear();
+  Result.resize_for_overwrite(Capacity);
+  Output = Result.data();
+  OutputLength = Capacity;
+}
+
+namespace {
+// Direction of a table-based conversion between UTF-8 and IBM-1047.
+enum ConversionType {
+  UTF8ToIBM1047,
+  IBM1047ToUTF8,
+};
+
+// Table-driven converter between EBCDIC 1047 and UTF-8. Built-in translation
+// tables suffice here only because EBCDIC 1047 is a single-byte, stateless
+// encoding; no other encodings are handled by this class.
+class TextEncodingConverterTable
+    : public details::TextEncodingConverterImplBase {
+  // Direction of the conversion, fixed at construction.
+  const ConversionType ConvType;
+
+public:
+  TextEncodingConverterTable(ConversionType Direction) : ConvType(Direction) {}
+
+  // Convert Source according to ConvType, writing the output to Result.
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  // This converter holds no state beyond ConvType; nothing to reset.
+  void reset() override {}
+};
+
+// Dispatch to the ConverterEBCDIC routine matching ConvType. The output is
+// written to Result. Converting to UTF-8 always reports success, whereas
+// converting to EBCDIC forwards the error code returned by convertToEBCDIC.
+std::error_code
+TextEncodingConverterTable::convertString(StringRef Source,
+                                          SmallVectorImpl<char> &Result) {
+  switch (ConvType) {
+  case IBM1047ToUTF8:
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  case UTF8ToIBM1047:
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  // Every enumerator returns above; llvm_unreachable does not return, so no
+  // trailing return statement is needed.
+  llvm_unreachable("Invalid ConvType!");
+}
+
+#if HAVE_ICU
+// Deleter that closes an ICU converter handle via ucnv_close; a null handle is
+// ignored.
+struct UConverterDeleter {
+  void operator()(UConverter *Converter) const {
+    if (!Converter)
+      return;
+    ucnv_close(Converter);
+  }
+};
+// Owning handle for a UConverter, released through UConverterDeleter.
+using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
+
+// Converter backed by a pair of ICU UConverter handles, both owned by this
+// object.
+class TextEncodingConverterICU : public details::TextEncodingConverterImplBase {
+  UConverterUniquePtr FromConvDesc;
+  UConverterUniquePtr ToConvDesc;
+
+public:
+  // Takes ownership of both converter handles.
+  TextEncodingConverterICU(UConverterUniquePtr From, UConverterUniquePtr To)
+      : FromConvDesc(std::move(From)), ToConvDesc(std::move(To)) {}
+
+  std::error_code convertString(StringRef Source,
+                                SmallVectorImpl<char> &Result) override;
+
+  void reset() override;
+};
+
+// TODO: The current implementation discards the partial result and restarts the
+// conversion from the beginning if there is a conversion error due to
+// insufficient buffer size. In the future, it would be better to save the
+// partial result and redo the conversion for the remaining string.
+std::error_code
+TextEncodingConverterICU::convertString(StringRef Source,
+                                        SmallVectorImpl<char> &Result) {
+  // Setup the input in case it has no backing data.
+  size_t InputLength = Source.size();
+  const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  size_t OutputLength = Capacity;
+  Result.resize_for_overwrite(Capacity);
+  char *Output;
+  UErrorCode EC = U_ZERO_ERROR;
+
+  ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
+                      &EC);
+  ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
+                        NULL, &EC);
+  assert(U_SUCCESS(EC));
----------------
hubert-reinterpretcast wrote:

Should this be added to `reset` (and should the constructor call `reset`) instead of having it here?

https://github.com/llvm/llvm-project/pull/138893