[llvm] r211015 - Support/ConvertUTF: implement U+FFFD insertion according to the recommendation

NAKAMURA Takumi geek4civic at gmail.com
Mon Jun 16 08:35:57 PDT 2014


2014-06-16 20:09 GMT+09:00 Dmitri Gribenko <gribozavr at gmail.com>:
> Author: gribozavr
> Date: Mon Jun 16 06:09:46 2014
> New Revision: 211015
>
> URL: http://llvm.org/viewvc/llvm-project?rev=211015&view=rev
> Log:
> Support/ConvertUTF: implement U+FFFD insertion according to the recommendation
> given in the Unicode spec
>
> That is, replace every maximal subpart of an ill-formed subsequence with one
> U+FFFD.
>
> Modified:
>     llvm/trunk/include/llvm/Support/ConvertUTF.h
>     llvm/trunk/lib/Support/ConvertUTF.c
>     llvm/trunk/unittests/Support/ConvertUTFTest.cpp
>
> Modified: llvm/trunk/include/llvm/Support/ConvertUTF.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/ConvertUTF.h?rev=211015&r1=211014&r2=211015&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/Support/ConvertUTF.h (original)
> +++ llvm/trunk/include/llvm/Support/ConvertUTF.h Mon Jun 16 06:09:46 2014
> @@ -136,7 +136,19 @@ ConversionResult ConvertUTF8toUTF16 (
>    const UTF8** sourceStart, const UTF8* sourceEnd,
>    UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
>
> -ConversionResult ConvertUTF8toUTF32 (
> +/**
> + * Convert a partial UTF8 sequence to UTF32.  If the sequence ends in an
> + * incomplete code unit sequence, returns \c sourceExhausted.
> + */
> +ConversionResult ConvertUTF8toUTF32Partial(
> +  const UTF8** sourceStart, const UTF8* sourceEnd,
> +  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
> +
> +/**
> + * Convert a complete UTF8 sequence to UTF32.  If the sequence ends in an
> + * incomplete code unit sequence, returns \c sourceIllegal.
> + */
> +ConversionResult ConvertUTF8toUTF32(
>    const UTF8** sourceStart, const UTF8* sourceEnd,
>    UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
>
>
> Modified: llvm/trunk/lib/Support/ConvertUTF.c
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/ConvertUTF.c?rev=211015&r1=211014&r2=211015&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Support/ConvertUTF.c (original)
> +++ llvm/trunk/lib/Support/ConvertUTF.c Mon Jun 16 06:09:46 2014
> @@ -51,6 +51,7 @@
>  #ifdef CVTUTF_DEBUG
>  #include <stdio.h>
>  #endif
> +#include <assert.h>
>
>  static const int halfShift  = 10; /* used for shifting by 10 bits */
>
> @@ -392,6 +393,97 @@ Boolean isLegalUTF8Sequence(const UTF8 *
>
>  /* --------------------------------------------------------------------- */
>
> +static unsigned
> +findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
> +                                          const UTF8 *sourceEnd) {
> +  assert(!isLegalUTF8Sequence(source, sourceEnd));
> +
> +  /*
> +   * Unicode 6.3.0, D93b:
> +   *
> +   *   Maximal subpart of an ill-formed subsequence: The longest code unit
> +   *   subsequence starting at an unconvertible offset that is either:
> +   *   a. the initial subsequence of a well-formed code unit sequence, or
> +   *   b. a subsequence of length one.
> +   */
> +
> +  if (source == sourceEnd)
> +    return 0;
> +
> +  /*
> +   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
> +   * Byte Sequences.
> +   */
> +
> +  UTF8 b1 = *source;
> +  ++source;
> +  if (b1 >= 0xC2 && b1 <= 0xDF) {
> +    /*
> +     * First byte is valid, but we know that this code unit sequence is
> +     * invalid, so the maximal subpart has to end after the first byte.
> +     */
> +    return 1;
> +  }
> +
> +  if (source == sourceEnd)
> +    return 1;
> +
> +  UTF8 b2 = *source;
> +  ++source;
> +
> +  if (b1 == 0xE0) {
> +    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
> +  }
> +  if (b1 >= 0xE1 && b1 <= 0xEC) {
> +    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
> +  }
> +  if (b1 == 0xED) {
> +    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
> +  }
> +  if (b1 >= 0xEE && b1 <= 0xEF) {
> +    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
> +  }
> +  if (b1 == 0xF0) {
> +    if (b2 >= 0x90 && b2 <= 0xBF) {
> +      if (source == sourceEnd)
> +        return 2;
> +
> +      UTF8 b3 = *source;
> +      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
> +    }
> +    return 1;
> +  }
> +  if (b1 >= 0xF1 && b1 <= 0xF3) {
> +    if (b2 >= 0x80 && b2 <= 0xBF) {
> +      if (source == sourceEnd)
> +        return 2;
> +
> +      UTF8 b3 = *source;
> +      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
> +    }
> +    return 1;
> +  }
> +  if (b1 == 0xF4) {
> +    if (b2 >= 0x80 && b2 <= 0x8F) {
> +      if (source == sourceEnd)
> +        return 2;
> +
> +      UTF8 b3 = *source;
> +      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
> +    }
> +    return 1;
> +  }
> +
> +  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
> +  /*
> +   * There are no valid sequences that start with these bytes.  Maximal subpart
> +   * is defined to have length 1 in these cases.
> +   */
> +  return 1;
> +}
> +
> +/* --------------------------------------------------------------------- */
> +
>  /*
>   * Exported function to return the total number of bytes in a codepoint
>   * represented in UTF-8, given the value of the first byte.
> @@ -491,9 +583,10 @@ ConversionResult ConvertUTF8toUTF16 (
>
>  /* --------------------------------------------------------------------- */
>
> -ConversionResult ConvertUTF8toUTF32 (
> +static ConversionResult ConvertUTF8toUTF32Impl(
>          const UTF8** sourceStart, const UTF8* sourceEnd,
> -        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
> +        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
> +        Boolean InputIsPartial) {
>      ConversionResult result = conversionOK;
>      const UTF8* source = *sourceStart;
>      UTF32* target = *targetStart;
> @@ -501,12 +594,42 @@ ConversionResult ConvertUTF8toUTF32 (
>          UTF32 ch = 0;
>          unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
>          if (extraBytesToRead >= sourceEnd - source) {
> -            result = sourceExhausted; break;
> +            if (flags == strictConversion || InputIsPartial) {
> +                result = sourceExhausted;
> +                break;
> +            } else {
> +                result = sourceIllegal;
> +
> +                /*
> +                 * Replace the maximal subpart of ill-formed sequence with
> +                 * replacement character.
> +                 */
> +                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
> +                                                                    sourceEnd);
> +                *target++ = UNI_REPLACEMENT_CHAR;
> +                continue;
> +            }
>          }
> +        if (target >= targetEnd) {
> +            result = targetExhausted; break;
> +        }
> +
>          /* Do this check whether lenient or strict */
>          if (!isLegalUTF8(source, extraBytesToRead+1)) {
>              result = sourceIllegal;
> -            break;
> +            if (flags == strictConversion) {
> +                /* Abort conversion. */
> +                break;
> +            } else {
> +                /*
> +                 * Replace the maximal subpart of ill-formed sequence with
> +                 * replacement character.
> +                 */
> +                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
> +                                                                    sourceEnd);
> +                *target++ = UNI_REPLACEMENT_CHAR;
> +                continue;
> +            }
>          }
>          /*
>           * The cases all fall through. See "Note A" below.
> @@ -521,10 +644,6 @@ ConversionResult ConvertUTF8toUTF32 (
>          }
>          ch -= offsetsFromUTF8[extraBytesToRead];
>
> -        if (target >= targetEnd) {
> -            source -= (extraBytesToRead+1); /* Back up the source pointer! */
> -            result = targetExhausted; break;
> -        }
>          if (ch <= UNI_MAX_LEGAL_UTF32) {
>              /*
>               * UTF-16 surrogate values are illegal in UTF-32, and anything
> @@ -551,6 +670,22 @@ ConversionResult ConvertUTF8toUTF32 (
>      return result;
>  }
>
> +ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
> +                                           const UTF8 *sourceEnd,
> +                                           UTF32 **targetStart,
> +                                           UTF32 *targetEnd,
> +                                           ConversionFlags flags) {
> +  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
> +                                flags, /*InputIsPartial=*/true);
> +}
> +
> +ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
> +                                    const UTF8 *sourceEnd, UTF32 **targetStart,
> +                                    UTF32 *targetEnd, ConversionFlags flags) {
> +  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
> +                                flags, /*InputIsPartial=*/false);
> +}
> +
>  /* ---------------------------------------------------------------------
>
>      Note A.
>
> Modified: llvm/trunk/unittests/Support/ConvertUTFTest.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/ConvertUTFTest.cpp?rev=211015&r1=211014&r2=211015&view=diff
> ==============================================================================
> --- llvm/trunk/unittests/Support/ConvertUTFTest.cpp (original)
> +++ llvm/trunk/unittests/Support/ConvertUTFTest.cpp Mon Jun 16 06:09:46 2014
> @@ -10,6 +10,7 @@
>  #include "llvm/Support/ConvertUTF.h"
>  #include "gtest/gtest.h"
>  #include <string>
> +#include <vector>
>
>  using namespace llvm;
>
> @@ -63,3 +64,1188 @@ TEST(ConvertUTFTest, HasUTF16BOM) {
>    HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe", 1));
>    EXPECT_FALSE(HasBOM);
>  }
> +
> +std::pair<ConversionResult, std::vector<unsigned>>
> +ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
> +  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
> +
> +  const UTF8 *SourceNext = SourceStart;
> +  std::vector<UTF32> Decoded(S.size(), 0);
> +  UTF32 *TargetStart = Decoded.data();
> +
> +  auto Result =
> +      ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
> +                         Decoded.data() + Decoded.size(), lenientConversion);
> +
> +  Decoded.resize(TargetStart - Decoded.data());
> +
> +  return std::make_pair(Result, Decoded);
> +}
> +
> +#define R0(RESULT) std::make_pair(RESULT, std::vector<unsigned>{})
> +#define R(RESULT, ...) std::make_pair(RESULT, std::vector<unsigned>{ __VA_ARGS__ })

Could you avoid the initializer list here?

> +
> +TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
> +
> +  //
> +  // 1-byte sequences
> +  //
> +
> +  // U+0041 LATIN CAPITAL LETTER A
> +  EXPECT_EQ(R(conversionOK, 0x0041),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x41"));
> +
> +  //
> +  // 2-byte sequences
> +  //
> +
> +  // U+0283 LATIN SMALL LETTER ESH
> +  EXPECT_EQ(R(conversionOK, 0x0283),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xca\x83"));
> +
> +  // U+03BA GREEK SMALL LETTER KAPPA
> +  // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
> +  // U+03C3 GREEK SMALL LETTER SIGMA
> +  // U+03BC GREEK SMALL LETTER MU
> +  // U+03B5 GREEK SMALL LETTER EPSILON
> +  EXPECT_EQ(R(conversionOK, 0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
> +
> +  //
> +  // 3-byte sequences
> +  //
> +
> +  // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
> +  // U+6587 CJK UNIFIED IDEOGRAPH-6587
> +  EXPECT_EQ(R(conversionOK, 0x4f8b, 0x6587),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe4\xbe\x8b\xe6\x96\x87"));
> +
> +  // U+D55C HANGUL SYLLABLE HAN
> +  // U+AE00 HANGUL SYLLABLE GEUL
> +  EXPECT_EQ(R(conversionOK, 0xd55c, 0xae00),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\x95\x9c\xea\xb8\x80"));
> +
> +  // U+1112 HANGUL CHOSEONG HIEUH
> +  // U+1161 HANGUL JUNGSEONG A
> +  // U+11AB HANGUL JONGSEONG NIEUN
> +  // U+1100 HANGUL CHOSEONG KIYEOK
> +  // U+1173 HANGUL JUNGSEONG EU
> +  // U+11AF HANGUL JONGSEONG RIEUL
> +  EXPECT_EQ(R(conversionOK, 0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
> +          "\xe1\x86\xaf"));
> +
> +  //
> +  // 4-byte sequences
> +  //
> +
> +  // U+E0100 VARIATION SELECTOR-17
> +  EXPECT_EQ(R(conversionOK, 0x000E0100),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\xa0\x84\x80"));
> +
> +  //
> +  // First possible sequence of a certain length
> +  //
> +
> +  // U+0000 NULL
> +  EXPECT_EQ(R(conversionOK, 0x0000),
> +      ConvertUTF8ToUnicodeScalarsLenient(StringRef("\x00", 1)));
> +
> +  // U+0080 PADDING CHARACTER
> +  EXPECT_EQ(R(conversionOK, 0x0080),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc2\x80"));
> +
> +  // U+0800 SAMARITAN LETTER ALAF
> +  EXPECT_EQ(R(conversionOK, 0x0800),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\xa0\x80"));
> +
> +  // U+10000 LINEAR B SYLLABLE B008 A
> +  EXPECT_EQ(R(conversionOK, 0x10000),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x90\x80\x80"));
> +
> +  // U+200000 (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88\x80\x80\x80"));
> +
> +  // U+4000000 (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80\x80\x80\x80"));
> +
> +  //
> +  // Last possible sequence of a certain length
> +  //
> +
> +  // U+007F DELETE
> +  EXPECT_EQ(R(conversionOK, 0x007f),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x7f"));
> +
> +  // U+07FF (unassigned)
> +  EXPECT_EQ(R(conversionOK, 0x07ff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xdf\xbf"));
> +
> +  // U+FFFF (noncharacter)
> +  EXPECT_EQ(R(conversionOK, 0xffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbf"));
> +
> +  // U+1FFFFF (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf7\xbf\xbf\xbf"));
> +
> +  // U+3FFFFFF (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf\xbf\xbf\xbf"));
> +
> +  // U+7FFFFFFF (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf\xbf\xbf\xbf"));
> +
> +  //
> +  // Other boundary conditions
> +  //
> +
> +  // U+D7FF (unassigned)
> +  EXPECT_EQ(R(conversionOK, 0xd7ff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\x9f\xbf"));
> +
> +  // U+E000 (private use)
> +  EXPECT_EQ(R(conversionOK, 0xe000),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xee\x80\x80"));
> +
> +  // U+FFFD REPLACEMENT CHARACTER
> +  EXPECT_EQ(R(conversionOK, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbd"));
> +
> +  // U+10FFFF (noncharacter)
> +  EXPECT_EQ(R(conversionOK, 0x10ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf\xbf"));
> +
> +  // U+110000 (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x90\x80\x80"));
> +
> +  //
> +  // Unexpected continuation bytes
> +  //
> +
> +  // In a sequence of unexpected continuation bytes that don't follow a first
> +  // byte, every byte is a maximal subpart.
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xbf\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf\x80\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf\x82\xbf\xaa"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xaa\xb0\xbb\xbf\xaa\xa0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
> +
> +  // All continuation bytes (0x80--0xbf).
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
> +          "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
> +          "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
> +          "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
> +
> +  //
> +  // Lonely start bytes
> +  //
> +
> +  // Start bytes of 2-byte sequences (0xc0--0xdf).
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
> +          "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
> +
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
> +          "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
> +          "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
> +          "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
> +
> +  // Start bytes of 3-byte sequences (0xe0--0xef).
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
> +
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
> +          "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
> +
> +  // Start bytes of 4-byte sequences (0xf0--0xf7).
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
> +
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
> +
> +  // Start bytes of 5-byte sequences (0xf8--0xfb).
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\xf9\xfa\xfb"));
> +
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
> +
> +  // Start bytes of 6-byte sequences (0xfc--0xfd).
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\xfd"));
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0x0020, 0xfffd, 0x0020),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x20\xfd\x20"));
> +
> +  //
> +  // Other bytes (0xc0--0xc1, 0xfe--0xff).
> +  //
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc1"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfe"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xff"));
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc0\xc1\xfe\xff"));
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfe\xfe\xff\xff"));
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfe\x80\x80\x80\x80\x80"));
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xff\x80\x80\x80\x80\x80"));
> +
> +  EXPECT_EQ(R(sourceIllegal,
> +              0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
> +
> +  //
> +  // Sequences with one continuation byte missing
> +  //
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc2"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xdf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\xa0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe1\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xec\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\x9f"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xee\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x90\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf"));
> +
> +  // Overlong sequences with one trailing byte missing.
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc1"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\x9f"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80\x80"));
> +
> +  // Sequences that represent surrogates with one trailing byte missing.
> +  // High surrogates
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xac"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf"));
> +  // Low surrogates
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xb0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xb4"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xbf"));
> +
> +  // Ill-formed 4-byte sequences.
> +  // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+1100xx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x90\x80"));
> +  // U+13FFxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf5\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf6\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf7\x80\x80"));
> +  // U+1FFFxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf7\xbf\xbf"));
> +
> +  // Ill-formed 5-byte sequences.
> +  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+2000xx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\xbf\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf9\x80\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfa\x80\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb\x80\x80\x80"));
> +  // U+3FFFFxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf\xbf\xbf"));
> +
> +  // Ill-formed 6-byte sequences.
> +  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+40000xx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf\xbf\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80\x80\x80\x80"));
> +  // U+7FFFFFxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf\xbf\xbf"));
> +
> +  //
> +  // Sequences with two continuation bytes missing
> +  //
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x90"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f"));
> +
> +  // Overlong sequences with two trailing bytes missing.
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80"));
> +
> +  // Sequences that represent surrogates with two trailing bytes missing.
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed"));
> +
> +  // Ill-formed 4-byte sequences.
> +  // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+110yxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x90"));
> +  // U+13Fyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf5\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf6\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf7\x80"));
> +  // U+1FFyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf7\xbf"));
> +
> +  // Ill-formed 5-byte sequences.
> +  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+200yxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf9\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfa\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb\x80\x80"));
> +  // U+3FFFyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf\xbf"));
> +
> +  // Ill-formed 6-byte sequences.
> +  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+4000yxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80\x80\x80"));
> +  // U+7FFFFyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf\xbf"));
> +
> +  //
> +  // Sequences with three continuation bytes missing
> +  //
> +
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4"));
> +
> +  // Broken overlong sequences.
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80"));
> +
> +  // Ill-formed 4-byte sequences.
> +  // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+14yyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf5"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf6"));
> +  // U+1Cyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf7"));
> +
> +  // Ill-formed 5-byte sequences.
> +  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+20yyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf9\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfa\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb\x80"));
> +  // U+3FCyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf"));
> +
> +  // Ill-formed 6-byte sequences.
> +  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+400yyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80\x80"));
> +  // U+7FFCyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf"));
> +
> +  //
> +  // Sequences with four continuation bytes missing
> +  //
> +
> +  // Ill-formed 5-byte sequences.
> +  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+uzyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf9"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfa"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb"));
> +  // U+3zyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfb"));
> +
> +  // Broken overlong sequences.
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80"));
> +
> +  // Ill-formed 6-byte sequences.
> +  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+uzzyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80"));
> +  // U+7Fzzyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf"));
> +
> +  //
> +  // Sequences with five continuation bytes missing
> +  //
> +
> +  // Ill-formed 6-byte sequences.
> +  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
> +  // U+uzzyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc"));
> +  // U+uuzzyyxx (invalid)
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfd"));
> +
> +  //
> +  // Consecutive sequences with trailing bytes missing
> +  //
> +
> +  EXPECT_EQ(R(sourceIllegal,
> +      0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd, /**/
> +      0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +      0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +      0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd, /**/
> +      0xfffd, 0xfffd, 0xfffd, 0xfffd,
> +      0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient(
> +          "\xc0" "\xe0\x80" "\xf0\x80\x80"
> +          "\xf8\x80\x80\x80"
> +          "\xfc\x80\x80\x80\x80"
> +          "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
> +          "\xfb\xbf\xbf\xbf"
> +          "\xfd\xbf\xbf\xbf\xbf"));
> +
> +
> +  //
> +  // Overlong UTF-8 sequences
> +  //
> +
> +  // U+002F SOLIDUS
> +  EXPECT_EQ(R(conversionOK, 0x002f),
> +      ConvertUTF8ToUnicodeScalarsLenient("\x2f"));
> +
> +  // Overlong sequences of the above.
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc0\xaf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\x80\xaf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80\x80\xaf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80\x80\xaf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80\x80\xaf"));
> +
> +  // U+0000 NULL
> +  EXPECT_EQ(R(conversionOK, 0x0000),
> +      ConvertUTF8ToUnicodeScalarsLenient(StringRef("\x00", 1)));
> +
> +  // Overlong sequences of the above.
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc0\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80\x80\x80"));
> +
> +  // Other overlong sequences, plus encoded surrogates (U+D800, U+DFFF).
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc0\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc1\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xc1\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xe0\x9f\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f\x80\x80"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf8\x87\xbf\xbf\xbf"));
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xfc\x83\xbf\xbf\xbf\xbf"));
> +
> +  //
> +  // Isolated surrogates
> +  //
> +
> +  // Unicode 6.3.0:
> +  //
> +  //    D71.  High-surrogate code point: A Unicode code point in the range
> +  //    U+D800 to U+DBFF.
> +  //
> +  //    D73.  Low-surrogate code point: A Unicode code point in the range
> +  //    U+DC00 to U+DFFF.
> +
> +  // Note: U+E0100 is <DB40 DD00> in UTF16.
> +
> +  // High surrogates
> +
> +  // U+D800
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80"));
> +
> +  // U+DB40
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0"));
> +
> +  // U+DBFF
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf"));
> +
> +  // Low surrogates
> +
> +  // U+DC00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xb0\x80"));
> +
> +  // U+DD00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xb4\x80"));
> +
> +  // U+DFFF
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xbf\xbf"));
> +
> +  // Surrogate pairs
> +
> +  // U+D800 U+DC00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80\xed\xb0\x80"));
> +
> +  // U+D800 U+DD00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80\xed\xb4\x80"));
> +
> +  // U+D800 U+DFFF
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80\xed\xbf\xbf"));
> +
> +  // U+DB40 U+DC00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0\xed\xb0\x80"));
> +
> +  // U+DB40 U+DD00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0\xed\xb4\x80"));
> +
> +  // U+DB40 U+DFFF
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0\xed\xbf\xbf"));
> +
> +  // U+DBFF U+DC00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf\xed\xb0\x80"));
> +
> +  // U+DBFF U+DD00
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf\xed\xb4\x80"));
> +
> +  // U+DBFF U+DFFF
> +  EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf\xed\xbf\xbf"));
> +
> +  //
> +  // Noncharacters
> +  //
> +
> +  // Unicode 6.3.0:
> +  //
> +  //    D14.  Noncharacter: A code point that is permanently reserved for
> +  //    internal use and that should never be interchanged. Noncharacters
> +  //    consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 10 hex)
> +  //    and the values U+FDD0..U+FDEF.
> +
> +  // U+FFFE
> +  EXPECT_EQ(R(conversionOK, 0xfffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbe"));
> +
> +  // U+FFFF
> +  EXPECT_EQ(R(conversionOK, 0xffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbf"));
> +
> +  // U+1FFFE
> +  EXPECT_EQ(R(conversionOK, 0x1fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x9f\xbf\xbe"));
> +
> +  // U+1FFFF
> +  EXPECT_EQ(R(conversionOK, 0x1ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\x9f\xbf\xbf"));
> +
> +  // U+2FFFE
> +  EXPECT_EQ(R(conversionOK, 0x2fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\xaf\xbf\xbe"));
> +
> +  // U+2FFFF
> +  EXPECT_EQ(R(conversionOK, 0x2ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\xaf\xbf\xbf"));
> +
> +  // U+3FFFE
> +  EXPECT_EQ(R(conversionOK, 0x3fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf\xbf\xbe"));
> +
> +  // U+3FFFF
> +  EXPECT_EQ(R(conversionOK, 0x3ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf\xbf\xbf"));
> +
> +  // U+4FFFE
> +  EXPECT_EQ(R(conversionOK, 0x4fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\x8f\xbf\xbe"));
> +
> +  // U+4FFFF
> +  EXPECT_EQ(R(conversionOK, 0x4ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\x8f\xbf\xbf"));
> +
> +  // U+5FFFE
> +  EXPECT_EQ(R(conversionOK, 0x5fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\x9f\xbf\xbe"));
> +
> +  // U+5FFFF
> +  EXPECT_EQ(R(conversionOK, 0x5ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\x9f\xbf\xbf"));
> +
> +  // U+6FFFE
> +  EXPECT_EQ(R(conversionOK, 0x6fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\xaf\xbf\xbe"));
> +
> +  // U+6FFFF
> +  EXPECT_EQ(R(conversionOK, 0x6ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\xaf\xbf\xbf"));
> +
> +  // U+7FFFE
> +  EXPECT_EQ(R(conversionOK, 0x7fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\xbf\xbf\xbe"));
> +
> +  // U+7FFFF
> +  EXPECT_EQ(R(conversionOK, 0x7ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf1\xbf\xbf\xbf"));
> +
> +  // U+8FFFE
> +  EXPECT_EQ(R(conversionOK, 0x8fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\x8f\xbf\xbe"));
> +
> +  // U+8FFFF
> +  EXPECT_EQ(R(conversionOK, 0x8ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\x8f\xbf\xbf"));
> +
> +  // U+9FFFE
> +  EXPECT_EQ(R(conversionOK, 0x9fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\x9f\xbf\xbe"));
> +
> +  // U+9FFFF
> +  EXPECT_EQ(R(conversionOK, 0x9ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\x9f\xbf\xbf"));
> +
> +  // U+AFFFE
> +  EXPECT_EQ(R(conversionOK, 0xafffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\xaf\xbf\xbe"));
> +
> +  // U+AFFFF
> +  EXPECT_EQ(R(conversionOK, 0xaffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\xaf\xbf\xbf"));
> +
> +  // U+BFFFE
> +  EXPECT_EQ(R(conversionOK, 0xbfffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\xbf\xbf\xbe"));
> +
> +  // U+BFFFF
> +  EXPECT_EQ(R(conversionOK, 0xbffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf2\xbf\xbf\xbf"));
> +
> +  // U+CFFFE
> +  EXPECT_EQ(R(conversionOK, 0xcfffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\x8f\xbf\xbe"));
> +
> +  // U+CFFFF
> +  EXPECT_EQ(R(conversionOK, 0xcfffF),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\x8f\xbf\xbf"));
> +
> +  // U+DFFFE
> +  EXPECT_EQ(R(conversionOK, 0xdfffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\x9f\xbf\xbe"));
> +
> +  // U+DFFFF
> +  EXPECT_EQ(R(conversionOK, 0xdffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\x9f\xbf\xbf"));
> +
> +  // U+EFFFE
> +  EXPECT_EQ(R(conversionOK, 0xefffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\xaf\xbf\xbe"));
> +
> +  // U+EFFFF
> +  EXPECT_EQ(R(conversionOK, 0xeffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\xaf\xbf\xbf"));
> +
> +  // U+FFFFE
> +  EXPECT_EQ(R(conversionOK, 0xffffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf\xbf\xbe"));
> +
> +  // U+FFFFF
> +  EXPECT_EQ(R(conversionOK, 0xfffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf\xbf\xbf"));
> +
> +  // U+10FFFE
> +  EXPECT_EQ(R(conversionOK, 0x10fffe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf\xbe"));
> +
> +  // U+10FFFF
> +  EXPECT_EQ(R(conversionOK, 0x10ffff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf\xbf"));
> +
> +  // U+FDD0
> +  EXPECT_EQ(R(conversionOK, 0xfdd0),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x90"));
> +
> +  // U+FDD1
> +  EXPECT_EQ(R(conversionOK, 0xfdd1),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x91"));
> +
> +  // U+FDD2
> +  EXPECT_EQ(R(conversionOK, 0xfdd2),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x92"));
> +
> +  // U+FDD3
> +  EXPECT_EQ(R(conversionOK, 0xfdd3),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x93"));
> +
> +  // U+FDD4
> +  EXPECT_EQ(R(conversionOK, 0xfdd4),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x94"));
> +
> +  // U+FDD5
> +  EXPECT_EQ(R(conversionOK, 0xfdd5),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x95"));
> +
> +  // U+FDD6
> +  EXPECT_EQ(R(conversionOK, 0xfdd6),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x96"));
> +
> +  // U+FDD7
> +  EXPECT_EQ(R(conversionOK, 0xfdd7),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x97"));
> +
> +  // U+FDD8
> +  EXPECT_EQ(R(conversionOK, 0xfdd8),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x98"));
> +
> +  // U+FDD9
> +  EXPECT_EQ(R(conversionOK, 0xfdd9),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x99"));
> +
> +  // U+FDDA
> +  EXPECT_EQ(R(conversionOK, 0xfdda),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9a"));
> +
> +  // U+FDDB
> +  EXPECT_EQ(R(conversionOK, 0xfddb),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9b"));
> +
> +  // U+FDDC
> +  EXPECT_EQ(R(conversionOK, 0xfddc),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9c"));
> +
> +  // U+FDDD
> +  EXPECT_EQ(R(conversionOK, 0xfddd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9d"));
> +
> +  // U+FDDE
> +  EXPECT_EQ(R(conversionOK, 0xfdde),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9e"));
> +
> +  // U+FDDF
> +  EXPECT_EQ(R(conversionOK, 0xfddf),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9f"));
> +
> +  // U+FDE0
> +  EXPECT_EQ(R(conversionOK, 0xfde0),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa0"));
> +
> +  // U+FDE1
> +  EXPECT_EQ(R(conversionOK, 0xfde1),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa1"));
> +
> +  // U+FDE2
> +  EXPECT_EQ(R(conversionOK, 0xfde2),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa2"));
> +
> +  // U+FDE3
> +  EXPECT_EQ(R(conversionOK, 0xfde3),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa3"));
> +
> +  // U+FDE4
> +  EXPECT_EQ(R(conversionOK, 0xfde4),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa4"));
> +
> +  // U+FDE5
> +  EXPECT_EQ(R(conversionOK, 0xfde5),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa5"));
> +
> +  // U+FDE6
> +  EXPECT_EQ(R(conversionOK, 0xfde6),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa6"));
> +
> +  // U+FDE7
> +  EXPECT_EQ(R(conversionOK, 0xfde7),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa7"));
> +
> +  // U+FDE8
> +  EXPECT_EQ(R(conversionOK, 0xfde8),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa8"));
> +
> +  // U+FDE9
> +  EXPECT_EQ(R(conversionOK, 0xfde9),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa9"));
> +
> +  // U+FDEA
> +  EXPECT_EQ(R(conversionOK, 0xfdea),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xaa"));
> +
> +  // U+FDEB
> +  EXPECT_EQ(R(conversionOK, 0xfdeb),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xab"));
> +
> +  // U+FDEC
> +  EXPECT_EQ(R(conversionOK, 0xfdec),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xac"));
> +
> +  // U+FDED
> +  EXPECT_EQ(R(conversionOK, 0xfded),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xad"));
> +
> +  // U+FDEE
> +  EXPECT_EQ(R(conversionOK, 0xfdee),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xae"));
> +
> +  // U+FDEF
> +  EXPECT_EQ(R(conversionOK, 0xfdef),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xaf"));
> +
> +  // U+FDF0
> +  EXPECT_EQ(R(conversionOK, 0xfdf0),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb0"));
> +
> +  // U+FDF1
> +  EXPECT_EQ(R(conversionOK, 0xfdf1),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb1"));
> +
> +  // U+FDF2
> +  EXPECT_EQ(R(conversionOK, 0xfdf2),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb2"));
> +
> +  // U+FDF3
> +  EXPECT_EQ(R(conversionOK, 0xfdf3),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb3"));
> +
> +  // U+FDF4
> +  EXPECT_EQ(R(conversionOK, 0xfdf4),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb4"));
> +
> +  // U+FDF5
> +  EXPECT_EQ(R(conversionOK, 0xfdf5),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb5"));
> +
> +  // U+FDF6
> +  EXPECT_EQ(R(conversionOK, 0xfdf6),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb6"));
> +
> +  // U+FDF7
> +  EXPECT_EQ(R(conversionOK, 0xfdf7),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb7"));
> +
> +  // U+FDF8
> +  EXPECT_EQ(R(conversionOK, 0xfdf8),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb8"));
> +
> +  // U+FDF9
> +  EXPECT_EQ(R(conversionOK, 0xfdf9),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb9"));
> +
> +  // U+FDFA
> +  EXPECT_EQ(R(conversionOK, 0xfdfa),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xba"));
> +
> +  // U+FDFB
> +  EXPECT_EQ(R(conversionOK, 0xfdfb),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbb"));
> +
> +  // U+FDFC
> +  EXPECT_EQ(R(conversionOK, 0xfdfc),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbc"));
> +
> +  // U+FDFD
> +  EXPECT_EQ(R(conversionOK, 0xfdfd),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbd"));
> +
> +  // U+FDFE
> +  EXPECT_EQ(R(conversionOK, 0xfdfe),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbe"));
> +
> +  // U+FDFF
> +  EXPECT_EQ(R(conversionOK, 0xfdff),
> +      ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbf"));
> +}
> +
> +std::pair<ConversionResult, std::vector<unsigned>>
> +ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
> +  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
> +
> +  const UTF8 *SourceNext = SourceStart;
> +  std::vector<UTF32> Decoded(S.size(), 0);
> +  UTF32 *TargetStart = Decoded.data();
> +
> +  auto Result = ConvertUTF8toUTF32Partial(
> +      &SourceNext, SourceStart + S.size(), &TargetStart,
> +      Decoded.data() + Decoded.size(), lenientConversion);
> +
> +  Decoded.resize(TargetStart - Decoded.data());
> +
> +  return std::make_pair(Result, Decoded);
> +}
> +
> +TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
> +  // U+0041 LATIN CAPITAL LETTER A
> +  EXPECT_EQ(R(conversionOK, 0x0041),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\x41"));
> +
> +  //
> +  // Sequences with one continuation byte missing
> +  //
> +
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xc2"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xdf"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xe0\xa0"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xe0\xbf"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xe1\x80"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xec\xbf"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xed\x80"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xed\x9f"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xee\x80"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xef\xbf"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xf0\x90\x80"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xf0\xbf\xbf"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xf1\x80\x80"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xf3\xbf\xbf"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xf4\x80\x80"));
> +  EXPECT_EQ(R0(sourceExhausted),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\xf4\x8f\xbf"));
> +
> +  EXPECT_EQ(R(sourceExhausted, 0x0041),
> +      ConvertUTF8ToUnicodeScalarsPartialLenient("\x41\xc2"));
> +}
> +
> +#undef R0
> +#undef R
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits



More information about the llvm-commits mailing list