[llvm] Create a CharSetConverter class with both iconv and icu support (PR #74516)

Hubert Tong via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 26 09:54:47 PDT 2024


================
@@ -0,0 +1,284 @@
+//===-- CharSet.cpp - Character set conversion class --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false; // True when the last character appended was a digit.
+  for (auto Ch : CSName) {
+    if (isAlnum(Ch)) { // Non-alphanumeric characters are dropped.
+      Ch = toLower(Ch); // Matching is case-insensitive.
+      if (Ch != '0' || PrevDigit) { // Skip a '0' unless a digit was just appended.
+        PrevDigit = isDigit(Ch);
+        Normalized.push_back(Ch); // Output is appended; Normalized is not cleared.
+      }
+    }
+  }
+}
+
+// Maps the charset name to enum constant if possible.
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized); // E.g. "UTF-8" becomes "utf8".
+  if (Normalized.equals("utf8"))
+    return text_encoding::id::UTF8;
+  if (Normalized.equals("ibm1047"))
+    return text_encoding::id::IBM1047;
+  return std::nullopt; // Name is not one of the two recognized here.
+}
+
+static void HandleOverflow(size_t &Capacity, char *&Output,
+                           size_t &OutputLength,
+                           SmallVectorImpl<char> &Result) {
+  // No space left in output buffer. Double the size of the underlying
+  // memory in the SmallVectorImpl, adjust pointer and length and continue
+  // the conversion.
+  Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
+                 ? 2 * Capacity
+                 : std::numeric_limits<size_t>::max();
+  Result.resize_for_overwrite(Capacity);
+  Output = static_cast<char *>(Result.data());
----------------
hubert-reinterpretcast wrote:

If we are going to ignore the previous contents, then we might as well not copy them.
```suggestion
  Result.resize(0);
  Result.resize_for_overwrite(Capacity);
  Output = static_cast<char *>(Result.data());
```

https://github.com/llvm/llvm-project/pull/74516


More information about the llvm-commits mailing list