[llvm] Create a CharSetConverter class with both iconv and icu support (PR #74516)

Hubert Tong via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 17 21:46:32 PDT 2024


================
@@ -0,0 +1,318 @@
+//===-- CharSet.cpp - Characters sets conversion class ------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encoding.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) {
+      Ch = toLower(Ch);
+      if (Ch != '0' || PrevDigit) {
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch);
+      }
+    }
+  }
+}
+
+// Maps the charset name to enum constant if possible.
+std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
+  return std::nullopt;
+}
+
+void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
+                    SmallVectorImpl<char> &Result) {
+  // No space left in output buffer. Double the size of the underlying
+  // memory in the SmallVectorImpl, adjust pointer and length and continue
+  // the conversion.
+  Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                 ? 2 * Capacity
+                 : std::numeric_limits<size_t>::max();
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
+  OutputLength = Capacity;
+}
+
+namespace {
+enum ConversionType {
+  UTFToIBM1047,
+  IBM1047ToUTF,
+};
+
+// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase {
+  ConversionType ConvType;
+
+public:
+  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterTable::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  if (ConvType == IBM1047ToUTF) {
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  } else if (ConvType == UTFToIBM1047) {
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+  return std::error_code();
+}
+
+std::error_code CharSetConverterTable::flush() const {
+  return std::error_code();
+}
+
+std::error_code
+CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
+}
+
+#ifdef HAVE_ICU
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  UConverter *FromConvDesc;
+  UConverter *ToConvDesc;
+
+public:
+  CharSetConverterICU(UConverter *Converter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    FromConvDesc = nullptr;
+    ToConvDesc = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC)) {
+      ToConvDesc = nullptr;
+    }
+  };
+
+  CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    FromConvDesc = ucnv_safeClone(FromConverter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC))
+      FromConvDesc = nullptr;
+    ToConvDesc = ucnv_safeClone(ToConverter, nullptr, nullptr, &EC);
+    if (U_FAILURE(EC))
+      ToConvDesc = nullptr;
+  }
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+std::error_code CharSetConverterICU::convert(StringRef Source,
+                                             SmallVectorImpl<char> &Result,
+                                             bool ShouldAutoFlush) const {
+  // Setup the output. We directly write into the SmallVector.
+  Result.resize_for_overwrite(Source.size());
+  size_t OutputLength, Capacity = Result.capacity();
+  char *Output, *Out;
+
+  UErrorCode EC = U_ZERO_ERROR;
+
+  do {
+    EC = U_ZERO_ERROR;
+    size_t InputLength = Source.size();
+    const char *Input =
+        InputLength ? const_cast<char *>(Source.data()) : nullptr;
+    const char *In = Input;
+    Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+    OutputLength = Capacity;
+    Out = Output;
+    Result.resize_for_overwrite(Capacity);
+    ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
+                   &Input, In + InputLength, /*pivotStart=*/NULL,
+                   /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
+                   /*pivotLimit=*/NULL, /*reset=*/true,
+                   /*flush=*/ShouldAutoFlush, &EC);
+    if (U_FAILURE(EC)) {
+      if (EC == U_BUFFER_OVERFLOW_ERROR &&
+          Capacity < std::numeric_limits<size_t>::max())
+        HandleOverflow(Capacity, Output, OutputLength, Result);
+      else
+        // Some other error occured.
+        return std::error_code(errno, std::generic_category());
+    } else if (U_SUCCESS(EC))
+      break;
+  } while (U_FAILURE(EC));
+
+  Result.resize(Output - Out);
+  return std::error_code();
+}
+
+std::error_code CharSetConverterICU::flush() const { return std::error_code(); }
+
+std::error_code
+CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
+  return std::error_code();
----------------
hubert-reinterpretcast wrote:

Flushing is not necessarily automatic on conversion (as implied by there being a `flush` parameter to `ucnv_convertEx`). This function is supposed to enable the user of the interface to retrieve the necessary shift sequence to return the output stream to the initial shift state. It does not appear to do so.

https://github.com/llvm/llvm-project/pull/74516


More information about the llvm-commits mailing list