3 #include "fly/concepts/concepts.hpp"
4 #include "fly/types/string/concepts.hpp"
5 #include "fly/types/string/detail/traits.hpp"
6 #include "fly/types/string/literals.hpp"
// Member declarations of the Unicode helper in namespace fly::detail. The out-of-class
// definitions later in this file qualify these members as BasicUnicode<CharType>.
// NOTE(review): this listing omits some lines, so several template headers below are not
// followed by the declaration they introduce.
14 namespace fly::detail {
29 template <fly::StandardCharacter CharType>
// String, string-view and codepoint aliases, all taken from `traits`.
33 using string_type =
typename traits::string_type;
34 using view_type =
typename traits::view_type;
35 using codepoint_type =
typename traits::codepoint_type;
// Declaration not visible here; presumably validate_encoding(it, end) - TODO confirm.
48 template <
typename IteratorType>
// Declaration not visible here; presumably convert_encoding(value) - TODO confirm.
61 template <
typename DesiredStringType>
// Declaration not visible here; presumably convert_encoding_into(value, out) - TODO confirm.
77 template <
typename DesiredStringType,
typename OutputIteratorType>
// Function name not visible here; presumably decode_codepoint(it, end), whose definition has
// the same optional<codepoint_type> return type - TODO confirm.
92 template <
typename IteratorType>
93 static std::optional<codepoint_type>
// Encode a single codepoint as a string of CharType; the result is optional.
104 static std::optional<string_type>
encode_codepoint(codepoint_type codepoint);
// Escape the codepoint decoded at `it`. The prefix character defaults to 'U' and must satisfy
// fly::UnicodePrefixCharacter. `it` is taken by non-const reference.
134 template <
char UnicodePrefix = 'U',
typename IteratorType>
135 requires fly::UnicodePrefixCharacter<UnicodePrefix>
136 static std::optional<string_type>
escape_codepoint(IteratorType &it,
const IteratorType &end);
// The declarations introduced by the next three template headers are not visible here.
157 template <
typename IteratorType>
177 template <
char UnicodePrefix>
178 requires fly::UnicodePrefixCharacter<UnicodePrefix>
193 template <
char UnicodePrefix,
typename IteratorType>
194 requires fly::UnicodePrefixCharacter<UnicodePrefix>
// codepoint_from_string: three overloads, each constrained on fly::SizeOfTypeIs<CharType, N>
// for N = 1, 2 and 4. Each reads one codepoint starting at `it`.
207 template <
typename IteratorType>
208 requires fly::SizeOfTypeIs<CharType, 1>
209 static codepoint_type codepoint_from_string(IteratorType &it,
const IteratorType &end);
221 template <
typename IteratorType>
222 requires fly::SizeOfTypeIs<CharType, 2>
223 static codepoint_type codepoint_from_string(IteratorType &it,
const IteratorType &end);
235 template <
typename IteratorType>
236 requires fly::SizeOfTypeIs<CharType, 4>
237 static codepoint_type codepoint_from_string(IteratorType &it,
const IteratorType &end);
// codepoint_to_string: three overloads with the same N = 1, 2, 4 constraints. Each writes the
// encoded form of `codepoint` through the output iterator `out`.
247 template <
typename OutputIteratorType>
248 requires fly::SizeOfTypeIs<CharType, 1>
249 static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
259 template <
typename OutputIteratorType>
260 requires fly::SizeOfTypeIs<CharType, 2>
261 static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
271 template <
typename OutputIteratorType>
272 requires fly::SizeOfTypeIs<CharType, 4>
273 static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
// Build a codepoint from values produced by the `next_codepoint` callback, which is invoked
// a second time when the first value is a high surrogate.
286 static codepoint_type
287 create_codepoint_from_surrogates(std::function<codepoint_type()> next_codepoint);
// Check a codepoint against the surrogate range and the maximum codepoint.
296 static bool validate_codepoint(codepoint_type codepoint);
// Read the element at `it` as a codepoint_type and advance, or report the invalid sentinel
// when `it` has reached `end`.
309 template <
typename IteratorType>
310 static codepoint_type next_encoded_byte(IteratorType &it,
const IteratorType &end);
// Fields describing one UTF-8 byte pattern. A byte matches the pattern when
// (byte & m_encoding_mask) == m_leading_byte.
318 const codepoint_type m_leading_byte;
321 const codepoint_type m_encoding_mask;
// Mask selecting the payload (codepoint) bits of a matching byte.
324 const codepoint_type m_codepoint_mask;
// For leading-byte entries: the total number of bytes in the sequence. For the continuation
// entry: the number of payload bits per continuation byte (6).
327 const codepoint_type m_codepoint_size;
// Leading-byte patterns for 1-, 2-, 3- and 4-byte sequences. Each initializer lists
// {leading byte, encoding mask, codepoint mask, size}, following the field order above.
330 static constexpr
const std::array<Utf8Data, 4> s_utf8_leading_bytes = {{
332 {0b0000'0000, 0b1000'0000, 0b0111'1111, 1},
335 {0b1100'0000, 0b1110'0000, 0b0001'1111, 2},
338 {0b1110'0000, 0b1111'0000, 0b0000'1111, 3},
341 {0b1111'0000, 0b1111'1000, 0b0000'0111, 4},
// Pattern for continuation bytes: the top two bits are 10, the low six bits are payload.
344 static constexpr
const Utf8Data s_utf8_continuation_byte =
345 {0b1000'0000, 0b1100'0000, 0b0011'1111, 6};
// Inclusive bounds of the high-surrogate range.
347 static constexpr
const codepoint_type s_high_surrogate_min = 0xd800;
348 static constexpr
const codepoint_type s_high_surrogate_max = 0xdbff;
// Inclusive bounds of the low-surrogate range.
349 static constexpr
const codepoint_type s_low_surrogate_min = 0xdc00;
350 static constexpr
const codepoint_type s_low_surrogate_max = 0xdfff;
// Largest codepoint accepted by validate_codepoint.
351 static constexpr
const codepoint_type s_max_codepoint = 0x10ffff;
// Sentinel returned by the decoding helpers on failure or end of input. It is greater than
// s_max_codepoint, so validate_codepoint does not accept it.
352 static constexpr
const codepoint_type s_invalid_codepoint = 0xffffffff;
// Character constants built with FLY_CHR for CharType. They are used when parsing hex digits
// and when recognising the \u and \U escape prefixes.
354 static constexpr
const auto s_zero = FLY_CHR(CharType,
'0');
355 static constexpr
const auto s_nine = FLY_CHR(CharType,
'9');
356 static constexpr
const auto s_lower_a = FLY_CHR(CharType,
'a');
357 static constexpr
const auto s_upper_a = FLY_CHR(CharType,
'A');
358 static constexpr
const auto s_lower_f = FLY_CHR(CharType,
'f');
359 static constexpr
const auto s_upper_f = FLY_CHR(CharType,
'F');
360 static constexpr
const auto s_lower_u = FLY_CHR(CharType,
'u');
361 static constexpr
const auto s_upper_u = FLY_CHR(CharType,
'U');
// Fragment of the definition that starts on line 367 (validate_encoding, per the listing index
// at the end of this file). The signature and surrounding statements are not visible here.
365 template <fly::StandardCharacter CharType>
366 template <
typename IteratorType>
// Taken when decode_codepoint yields an empty optional for the codepoint at `it`.
// NOTE(review): presumably this sits in a loop over [it, end) and returns false - confirm.
371 if (!decode_codepoint(it, end))
// Fragment of the definition that starts on line 383 (convert_encoding, per the listing index
// at the end of this file). It converts `value` into a new DesiredStringType.
381 template <fly::StandardCharacter CharType>
382 template <
typename DesiredStringType>
385 DesiredStringType result;
// Reserve one output element per input element up front. This is only an initial capacity;
// the back_inserter below can grow the string further.
386 result.reserve(
static_cast<typename DesiredStringType::size_type
>(value.size()));
// Delegate the conversion to convert_encoding_into, appending to `result`.
// NOTE(review): the statements guarded by this condition are not visible here.
388 if (convert_encoding_into<DesiredStringType>(std::move(value), std::back_inserter(result)))
// Fragment of the definition that starts on line 399 (convert_encoding_into, per the listing
// index at the end of this file). It decodes codepoints from `value` and writes each one to
// `out` in the desired encoding.
397 template <fly::StandardCharacter CharType>
398 template <
typename DesiredStringType,
typename OutputIteratorType>
403 auto it = value.cbegin();
404 const auto end = value.cend();
// decode_codepoint advances `it` and yields an empty optional on failure.
// NOTE(review): the enclosing loop and the failure branch are not visible here.
408 if (
auto codepoint = decode_codepoint(it, end); codepoint)
// DesiredUnicodeType is declared on a line not shown in this listing.
410 DesiredUnicodeType::codepoint_to_string(*codepoint, out);
// Fragment of a definition returning std::optional<codepoint_type>; presumably
// decode_codepoint(it, end), whose signature line is not visible here.
421 template <fly::StandardCharacter CharType>
422 template <
typename IteratorType>
424 -> std::optional<codepoint_type>
// Read one codepoint using the codepoint_from_string overload that matches sizeof(CharType).
426 const codepoint_type codepoint = codepoint_from_string(it, end);
// Only a codepoint accepted by validate_codepoint takes this branch.
// NOTE(review): the return statements are not visible here.
428 if (validate_codepoint(codepoint))
// Fragment of the definition that starts on line 438 (encode_codepoint, per the listing index
// at the end of this file).
437 template <fly::StandardCharacter CharType>
439 -> std::optional<string_type>
// Only a codepoint accepted by validate_codepoint is encoded.
441 if (validate_codepoint(codepoint))
// Append the encoded form of the codepoint to `result`, which is declared on a line not
// shown in this listing.
444 codepoint_to_string(codepoint, std::back_inserter(result));
// Fragment of the iterator overload of escape_codepoint (signature line not visible here).
// It decodes one codepoint starting at `it` and escapes it with the requested prefix.
453 template <fly::StandardCharacter CharType>
454 template <
char UnicodePrefix,
typename IteratorType>
455 requires fly::UnicodePrefixCharacter<UnicodePrefix>
457 -> std::optional<string_type>
// decode_codepoint advances `it`; the escape is produced only when decoding succeeds.
459 if (
auto codepoint = decode_codepoint(it, end); codepoint)
461 return escape_codepoint<UnicodePrefix>(*codepoint);
// Fragment of the public unescape_codepoint(it, end) definition (signature line not visible
// here). It parses an escaped codepoint starting at `it` and returns its encoded string form.
468 template <fly::StandardCharacter CharType>
469 template <
typename IteratorType>
471 -> std::optional<string_type>
// Test whether the next two characters are a backslash followed by `ch`.
// `it + 1` requires an iterator type that supports operator+.
473 auto escaped_with = [&it, &end](
const CharType ch) ->
bool {
// Fewer than two characters remain.
// NOTE(review): the statement guarded by this check is not visible; presumably returns false.
474 if ((it == end) || ((it + 1) == end))
479 return (*it ==
'\\') && (*(it + 1) == ch);
// Stays at the invalid sentinel unless one of the two escape forms below matches.
482 codepoint_type codepoint = s_invalid_codepoint;
484 if (escaped_with(s_lower_u))
// \u form: each callback invocation parses one \u escape. create_codepoint_from_surrogates
// calls it a second time when the first value is a high surrogate.
486 auto next_codepoint = [&it, &end]() -> codepoint_type {
487 return unescape_codepoint<s_lower_u>(it, end);
490 codepoint = create_codepoint_from_surrogates(std::move(next_codepoint));
492 else if (escaped_with(s_upper_u))
// \U form: a single escape holds the whole codepoint.
494 codepoint = unescape_codepoint<s_upper_u>(it, end);
// encode_codepoint validates the codepoint before encoding it.
497 return encode_codepoint(codepoint);
// Fragment of the escape_codepoint overload that takes a codepoint value (signature line not
// visible here). It builds the escaped text for `codepoint` in `result`, which is declared on
// a line not shown in this listing.
501 template <fly::StandardCharacter CharType>
502 template <
char UnicodePrefix>
503 requires fly::UnicodePrefixCharacter<UnicodePrefix>
// Format the low `length` nibbles of the codepoint as lowercase hex, most significant first.
509 auto to_hex = [&codepoint](std::size_t length) -> string_type {
510 static const auto *s_digits = FLY_STR(CharType,
"0123456789abcdef");
511 string_type hex(length, FLY_CHR(CharType,
'0'));
// `j` is the bit offset of the nibble written at index `i`. The final `j -= 4` wraps the
// unsigned value, but the loop has already ended on `i < length` by then.
513 for (std::size_t i = 0, j = (length - 1) * 4; i < length; ++i, j -= 4)
515 hex[i] = s_digits[(codepoint >> j) & 0x0f];
// Codepoints outside 0x20..0x7e are escaped.
521 if ((codepoint <= 0x1f) || (codepoint >= 0x7f))
// Codepoints up to 0xffff: only the leading backslash is visible in this listing.
523 if (codepoint <= 0xffff)
525 result += FLY_CHR(CharType,
'\\');
// Larger codepoints with the 'u' prefix are written as two escaped surrogates.
531 if constexpr (UnicodePrefix ==
'u')
// 0xd7c0 equals 0xd800 - (0x10000 >> 10), so the 0x10000 offset is folded into the constant.
533 const codepoint_type high_surrogate = 0xd7c0 + (codepoint >> 10);
534 const codepoint_type low_surrogate = 0xdc00 + (codepoint & 0x3ff);
536 result += escape_codepoint<UnicodePrefix>(high_surrogate);
537 result += escape_codepoint<UnicodePrefix>(low_surrogate);
// Otherwise: only the leading backslash is visible in this listing.
541 result += FLY_CHR(CharType,
'\\');
// Codepoints in 0x20..0x7e are appended unchanged as a single CharType.
549 result +=
static_cast<CharType
>(codepoint);
// Fragment of the unescape_codepoint<UnicodePrefix>(it, end) helper that returns a raw
// codepoint_type (signature line not visible here). It parses a backslash, the prefix
// character and a fixed number of hex digits, returning s_invalid_codepoint on malformed input.
556 template <fly::StandardCharacter CharType>
557 template <
char UnicodePrefix,
typename IteratorType>
558 requires fly::UnicodePrefixCharacter<UnicodePrefix>
// Require a backslash followed by the prefix character. `it` is advanced past the backslash
// as part of this check.
562 if ((it == end) || (*it !=
'\\') || (++it == end) || (*it != UnicodePrefix))
564 return s_invalid_codepoint;
567 codepoint_type codepoint = 0;
// The 'u' prefix takes 4 hex digits; any other prefix takes 8.
570 static constexpr
const codepoint_type s_expected_digits = (UnicodePrefix ==
'u') ? 4 : 8;
571 codepoint_type i = 0;
// Consume up to s_expected_digits characters, stopping early at the end of input.
573 for (i = 0; (i < s_expected_digits) && (it != end); ++i, ++it)
// The first digit is the most significant nibble.
575 const codepoint_type shift = (4 * (s_expected_digits - i - 1));
// 0x30 is '0', so this maps '0'..'9' to 0..9.
577 if ((*it >= s_zero) && (*it <= s_nine))
579 codepoint +=
static_cast<codepoint_type
>(*it - 0x30) << shift;
// 0x37 is 'A' - 10, so this maps 'A'..'F' to 10..15.
581 else if ((*it >= s_upper_a) && (*it <= s_upper_f))
583 codepoint +=
static_cast<codepoint_type
>(*it - 0x37) << shift;
// 0x57 is 'a' - 10, so this maps 'a'..'f' to 10..15.
585 else if ((*it >= s_lower_a) && (*it <= s_lower_f))
587 codepoint +=
static_cast<codepoint_type
>(*it - 0x57) << shift;
// Any other character makes the escape invalid.
591 return s_invalid_codepoint;
// Input that ended before all expected digits were read is invalid.
595 return (i == s_expected_digits) ? codepoint : s_invalid_codepoint;
// Decode one codepoint from a sequence of 1-byte characters (UTF-8), advancing `it`.
// Returns s_invalid_codepoint for an unrecognised leading byte, a malformed continuation byte
// or an overlong encoding.
// NOTE(review): the final return after the overlong check is not visible in this listing.
599 template <fly::StandardCharacter CharType>
600 template <
typename IteratorType>
601 requires fly::SizeOfTypeIs<CharType, 1>
602 auto BasicUnicode<CharType>::codepoint_from_string(IteratorType &it,
const IteratorType &end)
// At end of input next_encoded_byte yields s_invalid_codepoint (0xffffffff). That value matches
// none of the leading-byte patterns, so it is rejected below.
605 const codepoint_type leading_byte = next_encoded_byte(it, end);
// Find the table entry whose masked pattern matches the leading byte.
608 auto utf8_it = std::find_if(
609 s_utf8_leading_bytes.begin(),
610 s_utf8_leading_bytes.end(),
611 [&leading_byte](
const auto &candidate) {
612 return (leading_byte & candidate.m_encoding_mask) == candidate.m_leading_byte;
615 if (utf8_it == s_utf8_leading_bytes.end())
617 return s_invalid_codepoint;
// `bytes` is the total sequence length. Each continuation byte carries 6 payload bits, so the
// leading byte's payload starts 6 * (bytes - 1) bits up.
620 const std::size_t bytes = utf8_it->m_codepoint_size;
621 std::size_t shift = s_utf8_continuation_byte.m_codepoint_size * (bytes - 1);
624 codepoint_type codepoint = (leading_byte & utf8_it->m_codepoint_mask) << shift;
// Merge the payload of each of the remaining (bytes - 1) continuation bytes.
626 for (std::size_t i = 1; i < bytes; ++i)
628 const codepoint_type continuation_byte = next_encoded_byte(it, end);
// A continuation byte must have the form 10xxxxxx. The end-of-input sentinel fails this too.
630 if ((continuation_byte & s_utf8_continuation_byte.m_encoding_mask) !=
631 s_utf8_continuation_byte.m_leading_byte)
633 return s_invalid_codepoint;
636 shift -= s_utf8_continuation_byte.m_codepoint_size;
637 codepoint |= (continuation_byte & s_utf8_continuation_byte.m_codepoint_mask) << shift;
// Reject overlong encodings: a codepoint must use the shortest sequence that can hold it.
641 if (((codepoint < 0x80) && (bytes != 1)) ||
642 ((codepoint >= 0x80) && (codepoint < 0x800) && (bytes != 2)) ||
643 ((codepoint >= 0x800) && (codepoint < 0x10000) && (bytes != 3)))
645 return s_invalid_codepoint;
// Decode one codepoint from a sequence of 2-byte characters, advancing `it`.
// Surrogate handling is delegated to create_codepoint_from_surrogates.
652 template <fly::StandardCharacter CharType>
653 template <
typename IteratorType>
654 requires fly::SizeOfTypeIs<CharType, 2>
655 auto BasicUnicode<CharType>::codepoint_from_string(IteratorType &it,
const IteratorType &end)
// Each invocation reads the next 2-byte unit, or yields s_invalid_codepoint at end of input.
658 auto next_codepoint = [&it, &end]() -> codepoint_type {
659 return next_encoded_byte(it, end);
662 return create_codepoint_from_surrogates(std::move(next_codepoint));
// Decode one codepoint from a sequence of 4-byte characters, advancing `it`.
// The element value is returned directly as the codepoint; s_invalid_codepoint is returned
// at end of input.
666 template <fly::StandardCharacter CharType>
667 template <
typename IteratorType>
668 requires fly::SizeOfTypeIs<CharType, 4>
669 auto BasicUnicode<CharType>::codepoint_from_string(IteratorType &it,
const IteratorType &end)
672 return next_encoded_byte(it, end);
// Write `codepoint` to `out` as 1 to 4 one-byte characters (UTF-8).
// The codepoint is not validated here; callers visible in this file validate it first.
676 template <fly::StandardCharacter CharType>
677 template <
typename OutputIteratorType>
678 requires fly::SizeOfTypeIs<CharType, 1>
679 void BasicUnicode<CharType>::codepoint_to_string(codepoint_type codepoint, OutputIteratorType out)
// One byte: 0xxxxxxx.
681 if (codepoint < 0x80)
683 *out++ =
static_cast<CharType
>(codepoint);
// Two bytes: 110xxxxx 10xxxxxx.
685 else if (codepoint < 0x800)
687 *out++ =
static_cast<CharType
>(0xc0 | (codepoint >> 6));
688 *out++ =
static_cast<CharType
>(0x80 | (codepoint & 0x3f));
// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
690 else if (codepoint < 0x10000)
692 *out++ =
static_cast<CharType
>(0xe0 | (codepoint >> 12));
693 *out++ =
static_cast<CharType
>(0x80 | ((codepoint >> 6) & 0x3f));
694 *out++ =
static_cast<CharType
>(0x80 | (codepoint & 0x3f));
// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
698 *out++ =
static_cast<CharType
>(0xf0 | (codepoint >> 18));
699 *out++ =
static_cast<CharType
>(0x80 | ((codepoint >> 12) & 0x3f));
700 *out++ =
static_cast<CharType
>(0x80 | ((codepoint >> 6) & 0x3f));
701 *out++ =
static_cast<CharType
>(0x80 | (codepoint & 0x3f));
// Write `codepoint` to `out` as one or two 2-byte characters.
706 template <fly::StandardCharacter CharType>
707 template <
typename OutputIteratorType>
708 requires fly::SizeOfTypeIs<CharType, 2>
709 void BasicUnicode<CharType>::codepoint_to_string(codepoint_type codepoint, OutputIteratorType out)
// Codepoints below 0x10000 fit in a single unit.
711 if (codepoint < 0x10000)
713 *out++ =
static_cast<CharType
>(codepoint);
// Otherwise subtract 0x10000 and split the remainder into a surrogate pair: the bits above
// the low 10 go into the high surrogate, the low 10 bits into the low surrogate.
717 codepoint -= 0x10000;
718 *out++ =
static_cast<CharType
>(s_high_surrogate_min | (codepoint >> 10));
719 *out++ =
static_cast<CharType
>(s_low_surrogate_min | (codepoint & 0x3ff));
// Write `codepoint` to `out` as a single 4-byte character; the value is cast directly.
724 template <fly::StandardCharacter CharType>
725 template <
typename OutputIteratorType>
726 requires fly::SizeOfTypeIs<CharType, 4>
727 void BasicUnicode<CharType>::codepoint_to_string(codepoint_type codepoint, OutputIteratorType out)
729 *out++ =
static_cast<CharType
>(codepoint);
// Produce one codepoint from the values supplied by `next_codepoint`. A high surrogate must be
// followed by a low surrogate, and the two are combined into one codepoint. A low surrogate
// that comes first is invalid.
// NOTE(review): the final return for the non-surrogate case is not visible in this listing;
// presumably the value is returned unchanged - confirm.
733 template <fly::StandardCharacter CharType>
734 auto BasicUnicode<CharType>::create_codepoint_from_surrogates(
735 std::function<codepoint_type()> next_codepoint) -> codepoint_type
// Range checks against the inclusive surrogate bounds.
737 auto is_high_surrogate = [](codepoint_type c) ->
bool {
738 return (c >= s_high_surrogate_min) && (c <= s_high_surrogate_max);
740 auto is_low_surrogate = [](codepoint_type c) ->
bool {
741 return (c >= s_low_surrogate_min) && (c <= s_low_surrogate_max);
744 codepoint_type codepoint = next_codepoint();
746 if (is_high_surrogate(codepoint))
// A high surrogate needs a second value, so the callback is invoked again.
748 const codepoint_type low_surrogate = next_codepoint();
750 if (is_low_surrogate(low_surrogate))
// 0x35fdc00 equals (0xd800 << 10) + 0xdc00 - 0x10000. Subtracting it removes both surrogate
// bases and adds the 0x10000 offset in one step.
757 codepoint = (codepoint << 10) + low_surrogate - 0x35fdc00;
// High surrogate not followed by a low surrogate.
761 return s_invalid_codepoint;
764 else if (is_low_surrogate(codepoint))
// Low surrogate with no preceding high surrogate.
766 return s_invalid_codepoint;
// Check whether `codepoint` is acceptable.
// NOTE(review): the return statements are not visible in this listing; presumably both
// conditions below lead to `false` and everything else to `true` - confirm.
773 template <fly::StandardCharacter CharType>
774 bool BasicUnicode<CharType>::validate_codepoint(codepoint_type codepoint)
// The codepoint lies anywhere in the surrogate range (high or low).
776 if ((codepoint >= s_high_surrogate_min) && (codepoint <= s_low_surrogate_max))
// The codepoint exceeds the maximum; this also catches s_invalid_codepoint.
781 else if (codepoint > s_max_codepoint)
// Return the element at `it` cast to codepoint_type and advance `it` by one.
// Returns s_invalid_codepoint, without advancing, when `it` equals `end`.
// Despite the name, the element is one CharType unit, which may be 1, 2 or 4 bytes wide.
791 template <fly::StandardCharacter CharType>
792 template <
typename IteratorType>
793 inline auto BasicUnicode<CharType>::next_encoded_byte(IteratorType &it,
const IteratorType &end)
796 return (it == end) ? s_invalid_codepoint :
static_cast<codepoint_type
>(*(it++));
Definition: unicode.hpp:31
static std::optional< codepoint_type > decode_codepoint(IteratorType &it, const IteratorType &end)
static std::optional< string_type > encode_codepoint(codepoint_type codepoint)
Definition: unicode.hpp:438
static std::optional< string_type > unescape_codepoint(IteratorType &it, const IteratorType &end)
static bool validate_encoding(IteratorType &it, const IteratorType &end)
Definition: unicode.hpp:367
static std::optional< DesiredStringType > convert_encoding(view_type value)
Definition: unicode.hpp:383
requires fly::UnicodePrefixCharacter< UnicodePrefix > static std::optional< string_type > escape_codepoint(IteratorType &it, const IteratorType &end)
static bool convert_encoding_into(view_type value, OutputIteratorType out)
Definition: unicode.hpp:399
Definition: traits.hpp:18