libfly  6.2.2
C++20 utility library for Linux, macOS, and Windows
unicode.hpp
1 #pragma once
2 
3 #include "fly/concepts/concepts.hpp"
4 #include "fly/types/string/concepts.hpp"
5 #include "fly/types/string/detail/traits.hpp"
6 #include "fly/types/string/literals.hpp"
7 
8 #include <array>
9 #include <functional>
10 #include <iterator>
11 #include <optional>
12 #include <string>
13 
14 namespace fly::detail {
15 
// Helper class providing Unicode encoding, decoding, escaping and conversion routines for strings
// whose character type is CharType. All member functions are static.
//
// NOTE(review): this listing jumps from line 29 to 31 and from 31 to 33; the class header naming
// BasicUnicode and the declaration of the `traits` alias used below are not visible here.
29 template <fly::StandardCharacter CharType>
31 {
    // String, view and codepoint types are taken from the string traits for CharType.
33  using string_type = typename traits::string_type;
34  using view_type = typename traits::view_type;
35  using codepoint_type = typename traits::codepoint_type;
36 
37 public:
    // Decode codepoints from [it, end) until the range is exhausted or a codepoint fails to decode.
    // Returns true only if every codepoint decoded successfully. The iterator |it| is advanced as
    // codepoints are consumed.
48  template <typename IteratorType>
49  static bool validate_encoding(IteratorType &it, const IteratorType &end);
50 
    // Convert |value| to a DesiredStringType by decoding each codepoint and re-encoding it for the
    // character type of DesiredStringType. Returns std::nullopt if any codepoint fails to decode.
61  template <typename DesiredStringType>
62  static std::optional<DesiredStringType> convert_encoding(view_type value);
63 
    // Same conversion as convert_encoding, but the re-encoded characters are written through the
    // output iterator |out|. Returns false as soon as a codepoint fails to decode.
77  template <typename DesiredStringType, typename OutputIteratorType>
78  static bool convert_encoding_into(view_type value, OutputIteratorType out);
79 
    // Decode the next codepoint starting at |it|, advancing |it| past the consumed characters.
    // Returns std::nullopt if the decoded value does not pass validate_codepoint.
92  template <typename IteratorType>
93  static std::optional<codepoint_type>
94  decode_codepoint(IteratorType &it, const IteratorType &end);
95 
    // Encode |codepoint| as a string of CharType. Returns std::nullopt if |codepoint| does not pass
    // validate_codepoint.
104  static std::optional<string_type> encode_codepoint(codepoint_type codepoint);
105 
    // Decode the next codepoint starting at |it| and return its escaped form. Codepoints in the
    // range [0x20, 0x7e] are returned unescaped; codepoints up to 0xffff are escaped as \unnnn;
    // larger codepoints are escaped as a pair of \unnnn surrogates when UnicodePrefix is 'u', or as
    // \Unnnnnnnn otherwise. Returns std::nullopt if the codepoint fails to decode.
134  template <char UnicodePrefix = 'U', typename IteratorType>
135  requires fly::UnicodePrefixCharacter<UnicodePrefix>
136  static std::optional<string_type> escape_codepoint(IteratorType &it, const IteratorType &end);
137 
    // Parse an escape sequence of the form \unnnn (optionally a surrogate pair of two such
    // sequences) or \Unnnnnnnn starting at |it|, and return the encoded codepoint. Returns
    // std::nullopt if the range does not start with such a sequence or the result is not a valid
    // codepoint.
157  template <typename IteratorType>
158  static std::optional<string_type> unescape_codepoint(IteratorType &it, const IteratorType &end);
159 
160 private:
    // Every specialization is a friend of every other so that convert_encoding_into may call the
    // private codepoint_to_string of the specialization for the desired character type.
161  friend BasicUnicode<char>;
162  friend BasicUnicode<wchar_t>;
163  friend BasicUnicode<char8_t>;
164  friend BasicUnicode<char16_t>;
165  friend BasicUnicode<char32_t>;
166 
    // Escape a single, already-decoded codepoint (see the public escape_codepoint for the format).
177  template <char UnicodePrefix>
178  requires fly::UnicodePrefixCharacter<UnicodePrefix>
179  static string_type escape_codepoint(codepoint_type codepoint);
180 
    // Parse one escape sequence: a backslash, the UnicodePrefix character, then exactly 4 ('u') or
    // 8 (otherwise) hexadecimal digits. Returns s_invalid_codepoint on any malformed input.
193  template <char UnicodePrefix, typename IteratorType>
194  requires fly::UnicodePrefixCharacter<UnicodePrefix>
195  static codepoint_type unescape_codepoint(IteratorType &it, const IteratorType &end);
196 
    // Decode a codepoint from a string of 1-byte characters (UTF-8 leading byte followed by
    // continuation bytes). Returns s_invalid_codepoint for malformed or overlong sequences.
207  template <typename IteratorType>
208  requires fly::SizeOfTypeIs<CharType, 1>
209  static codepoint_type codepoint_from_string(IteratorType &it, const IteratorType &end);
210 
    // Decode a codepoint from a string of 2-byte characters, combining a surrogate pair if present.
221  template <typename IteratorType>
222  requires fly::SizeOfTypeIs<CharType, 2>
223  static codepoint_type codepoint_from_string(IteratorType &it, const IteratorType &end);
224 
    // Decode a codepoint from a string of 4-byte characters; the next character is the codepoint.
235  template <typename IteratorType>
236  requires fly::SizeOfTypeIs<CharType, 4>
237  static codepoint_type codepoint_from_string(IteratorType &it, const IteratorType &end);
238 
    // Encode |codepoint| as 1 to 4 1-byte characters written through |out|. No validation is done.
247  template <typename OutputIteratorType>
248  requires fly::SizeOfTypeIs<CharType, 1>
249  static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
250 
    // Encode |codepoint| as one 2-byte character, or as a surrogate pair when it is 0x10000 or
    // larger, written through |out|. No validation is done.
259  template <typename OutputIteratorType>
260  requires fly::SizeOfTypeIs<CharType, 2>
261  static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
262 
    // Encode |codepoint| as a single 4-byte character written through |out|. No validation is done.
271  template <typename OutputIteratorType>
272  requires fly::SizeOfTypeIs<CharType, 4>
273  static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
274 
    // Read one value from |next_codepoint|; if it is a high surrogate, read a second value and
    // combine the pair into a single codepoint. Returns s_invalid_codepoint for a high surrogate not
    // followed by a low surrogate, or for a lone low surrogate.
286  static codepoint_type
287  create_codepoint_from_surrogates(std::function<codepoint_type()> next_codepoint);
288 
    // Returns false for codepoints in the surrogate range [0xd800, 0xdfff] or above 0x10ffff.
296  static bool validate_codepoint(codepoint_type codepoint);
297 
    // Return the character at |it| converted to codepoint_type and advance |it|, or return
    // s_invalid_codepoint (without advancing) when |it| has reached |end|.
309  template <typename IteratorType>
310  static codepoint_type next_encoded_byte(IteratorType &it, const IteratorType &end);
311 
    // Description of one class of UTF-8 encoded byte.
315  struct Utf8Data
316  {
317  // The value of the UTF-8 encoded leading byte.
318  const codepoint_type m_leading_byte;
319 
320  // A bit-mask of the bits in the UTF-8 encoded leading byte reserved for encoding.
321  const codepoint_type m_encoding_mask;
322 
323  // A bit-mask of the bits in the UTF-8 encoded leading byte reserved for codepoint data.
324  const codepoint_type m_codepoint_mask;
325 
326  // The number of bytes required to decode the codepoint.
327  const codepoint_type m_codepoint_size;
328  };
329 
    // Table of the four UTF-8 leading byte forms, ordered by encoded length.
330  static constexpr const std::array<Utf8Data, 4> s_utf8_leading_bytes = {{
331  // Codepoint length 1, range [U+0000, U+007f], leading byte 0b0xxx'xxxx.
332  {0b0000'0000, 0b1000'0000, 0b0111'1111, 1},
333 
334  // Codepoint length 2, range [U+0080, U+07FF], leading byte 0b110x'xxxx.
335  {0b1100'0000, 0b1110'0000, 0b0001'1111, 2},
336 
337  // Codepoint length 3, range [U+0800, U+FFFF], leading byte 0b1110'xxxx.
338  {0b1110'0000, 0b1111'0000, 0b0000'1111, 3},
339 
340  // Codepoint length 4, range [U+10000, U+10FFFF], leading byte 0b1111'0xxx.
341  {0b1111'0000, 0b1111'1000, 0b0000'0111, 4},
342  }};
343 
    // UTF-8 continuation byte form 0b10xx'xxxx. Here m_codepoint_size is used by the 1-byte
    // codepoint_from_string as the number of codepoint bits (6) carried by each continuation byte,
    // not as a byte count.
344  static constexpr const Utf8Data s_utf8_continuation_byte =
345  {0b1000'0000, 0b1100'0000, 0b0011'1111, 6};
346 
    // Bounds of the high and low surrogate ranges, the largest valid codepoint, and the sentinel
    // value returned by the private decoding helpers to signal failure.
347  static constexpr const codepoint_type s_high_surrogate_min = 0xd800;
348  static constexpr const codepoint_type s_high_surrogate_max = 0xdbff;
349  static constexpr const codepoint_type s_low_surrogate_min = 0xdc00;
350  static constexpr const codepoint_type s_low_surrogate_max = 0xdfff;
351  static constexpr const codepoint_type s_max_codepoint = 0x10ffff;
352  static constexpr const codepoint_type s_invalid_codepoint = 0xffffffff;
353 
    // Character constants used when parsing and producing escape sequences.
354  static constexpr const auto s_zero = FLY_CHR(CharType, '0');
355  static constexpr const auto s_nine = FLY_CHR(CharType, '9');
356  static constexpr const auto s_lower_a = FLY_CHR(CharType, 'a');
357  static constexpr const auto s_upper_a = FLY_CHR(CharType, 'A');
358  static constexpr const auto s_lower_f = FLY_CHR(CharType, 'f');
359  static constexpr const auto s_upper_f = FLY_CHR(CharType, 'F');
360  static constexpr const auto s_lower_u = FLY_CHR(CharType, 'u');
361  static constexpr const auto s_upper_u = FLY_CHR(CharType, 'U');
362 };
363 
364 //==================================================================================================
365 template <fly::StandardCharacter CharType>
366 template <typename IteratorType>
367 bool BasicUnicode<CharType>::validate_encoding(IteratorType &it, const IteratorType &end)
368 {
369  while (it != end)
370  {
371  if (!decode_codepoint(it, end))
372  {
373  return false;
374  }
375  }
376 
377  return true;
378 }
379 
380 //==================================================================================================
381 template <fly::StandardCharacter CharType>
382 template <typename DesiredStringType>
383 inline std::optional<DesiredStringType> BasicUnicode<CharType>::convert_encoding(view_type value)
384 {
385  DesiredStringType result;
386  result.reserve(static_cast<typename DesiredStringType::size_type>(value.size()));
387 
388  if (convert_encoding_into<DesiredStringType>(std::move(value), std::back_inserter(result)))
389  {
390  return result;
391  }
392 
393  return std::nullopt;
394 }
395 
396 //==================================================================================================
397 template <fly::StandardCharacter CharType>
398 template <typename DesiredStringType, typename OutputIteratorType>
399 bool BasicUnicode<CharType>::convert_encoding_into(view_type value, OutputIteratorType out)
400 {
401  using DesiredUnicodeType = BasicUnicode<typename DesiredStringType::value_type>;
402 
403  auto it = value.cbegin();
404  const auto end = value.cend();
405 
406  while (it != end)
407  {
408  if (auto codepoint = decode_codepoint(it, end); codepoint)
409  {
410  DesiredUnicodeType::codepoint_to_string(*codepoint, out);
411  continue;
412  }
413 
414  return false;
415  }
416 
417  return true;
418 }
419 
420 //==================================================================================================
421 template <fly::StandardCharacter CharType>
422 template <typename IteratorType>
423 auto BasicUnicode<CharType>::decode_codepoint(IteratorType &it, const IteratorType &end)
424  -> std::optional<codepoint_type>
425 {
426  const codepoint_type codepoint = codepoint_from_string(it, end);
427 
428  if (validate_codepoint(codepoint))
429  {
430  return codepoint;
431  }
432 
433  return std::nullopt;
434 }
435 
436 //==================================================================================================
437 template <fly::StandardCharacter CharType>
438 auto BasicUnicode<CharType>::encode_codepoint(codepoint_type codepoint)
439  -> std::optional<string_type>
440 {
441  if (validate_codepoint(codepoint))
442  {
443  string_type result;
444  codepoint_to_string(codepoint, std::back_inserter(result));
445 
446  return result;
447  }
448 
449  return std::nullopt;
450 }
451 
452 //==================================================================================================
453 template <fly::StandardCharacter CharType>
454 template <char UnicodePrefix, typename IteratorType>
455 requires fly::UnicodePrefixCharacter<UnicodePrefix>
456 auto BasicUnicode<CharType>::escape_codepoint(IteratorType &it, const IteratorType &end)
457  -> std::optional<string_type>
458 {
459  if (auto codepoint = decode_codepoint(it, end); codepoint)
460  {
461  return escape_codepoint<UnicodePrefix>(*codepoint);
462  }
463 
464  return std::nullopt;
465 }
466 
467 //==================================================================================================
468 template <fly::StandardCharacter CharType>
469 template <typename IteratorType>
470 auto BasicUnicode<CharType>::unescape_codepoint(IteratorType &it, const IteratorType &end)
471  -> std::optional<string_type>
472 {
473  auto escaped_with = [&it, &end](const CharType ch) -> bool {
474  if ((it == end) || ((it + 1) == end))
475  {
476  return false;
477  }
478 
479  return (*it == '\\') && (*(it + 1) == ch);
480  };
481 
482  codepoint_type codepoint = s_invalid_codepoint;
483 
484  if (escaped_with(s_lower_u))
485  {
486  auto next_codepoint = [&it, &end]() -> codepoint_type {
487  return unescape_codepoint<s_lower_u>(it, end);
488  };
489 
490  codepoint = create_codepoint_from_surrogates(std::move(next_codepoint));
491  }
492  else if (escaped_with(s_upper_u))
493  {
494  codepoint = unescape_codepoint<s_upper_u>(it, end);
495  }
496 
497  return encode_codepoint(codepoint);
498 }
499 
500 //==================================================================================================
501 template <fly::StandardCharacter CharType>
502 template <char UnicodePrefix>
503 requires fly::UnicodePrefixCharacter<UnicodePrefix>
504 auto BasicUnicode<CharType>::escape_codepoint(codepoint_type codepoint) -> string_type
505 {
506  string_type result;
507 
508  // TODO: Replace this with BasicString::format without actually including formatters.hpp.
509  auto to_hex = [&codepoint](std::size_t length) -> string_type {
510  static const auto *s_digits = FLY_STR(CharType, "0123456789abcdef");
511  string_type hex(length, FLY_CHR(CharType, '0'));
512 
513  for (std::size_t i = 0, j = (length - 1) * 4; i < length; ++i, j -= 4)
514  {
515  hex[i] = s_digits[(codepoint >> j) & 0x0f];
516  }
517 
518  return hex;
519  };
520 
521  if ((codepoint <= 0x1f) || (codepoint >= 0x7f))
522  {
523  if (codepoint <= 0xffff)
524  {
525  result += FLY_CHR(CharType, '\\');
526  result += s_lower_u;
527  result += to_hex(4);
528  }
529  else
530  {
531  if constexpr (UnicodePrefix == 'u')
532  {
533  const codepoint_type high_surrogate = 0xd7c0 + (codepoint >> 10);
534  const codepoint_type low_surrogate = 0xdc00 + (codepoint & 0x3ff);
535 
536  result += escape_codepoint<UnicodePrefix>(high_surrogate);
537  result += escape_codepoint<UnicodePrefix>(low_surrogate);
538  }
539  else
540  {
541  result += FLY_CHR(CharType, '\\');
542  result += s_upper_u;
543  result += to_hex(8);
544  }
545  }
546  }
547  else
548  {
549  result += static_cast<CharType>(codepoint);
550  }
551 
552  return result;
553 }
554 
555 //==================================================================================================
556 template <fly::StandardCharacter CharType>
557 template <char UnicodePrefix, typename IteratorType>
558 requires fly::UnicodePrefixCharacter<UnicodePrefix>
559 auto BasicUnicode<CharType>::unescape_codepoint(IteratorType &it, const IteratorType &end)
560  -> codepoint_type
561 {
562  if ((it == end) || (*it != '\\') || (++it == end) || (*it != UnicodePrefix))
563  {
564  return s_invalid_codepoint;
565  }
566 
567  codepoint_type codepoint = 0;
568  ++it;
569 
570  static constexpr const codepoint_type s_expected_digits = (UnicodePrefix == 'u') ? 4 : 8;
571  codepoint_type i = 0;
572 
573  for (i = 0; (i < s_expected_digits) && (it != end); ++i, ++it)
574  {
575  const codepoint_type shift = (4 * (s_expected_digits - i - 1));
576 
577  if ((*it >= s_zero) && (*it <= s_nine))
578  {
579  codepoint += static_cast<codepoint_type>(*it - 0x30) << shift;
580  }
581  else if ((*it >= s_upper_a) && (*it <= s_upper_f))
582  {
583  codepoint += static_cast<codepoint_type>(*it - 0x37) << shift;
584  }
585  else if ((*it >= s_lower_a) && (*it <= s_lower_f))
586  {
587  codepoint += static_cast<codepoint_type>(*it - 0x57) << shift;
588  }
589  else
590  {
591  return s_invalid_codepoint;
592  }
593  }
594 
595  return (i == s_expected_digits) ? codepoint : s_invalid_codepoint;
596 }
597 
598 //==================================================================================================
599 template <fly::StandardCharacter CharType>
600 template <typename IteratorType>
601 requires fly::SizeOfTypeIs<CharType, 1>
602 auto BasicUnicode<CharType>::codepoint_from_string(IteratorType &it, const IteratorType &end)
603  -> codepoint_type
604 {
605  const codepoint_type leading_byte = next_encoded_byte(it, end);
606 
607  // First find the codepoint length by finding which leading byte matches the first encoded byte.
608  auto utf8_it = std::find_if(
609  s_utf8_leading_bytes.begin(),
610  s_utf8_leading_bytes.end(),
611  [&leading_byte](const auto &candidate) {
612  return (leading_byte & candidate.m_encoding_mask) == candidate.m_leading_byte;
613  });
614 
615  if (utf8_it == s_utf8_leading_bytes.end())
616  {
617  return s_invalid_codepoint;
618  }
619 
620  const std::size_t bytes = utf8_it->m_codepoint_size;
621  std::size_t shift = s_utf8_continuation_byte.m_codepoint_size * (bytes - 1);
622 
623  // Then decode the encoded bytes using the leading and continuation byte masks.
624  codepoint_type codepoint = (leading_byte & utf8_it->m_codepoint_mask) << shift;
625 
626  for (std::size_t i = 1; i < bytes; ++i)
627  {
628  const codepoint_type continuation_byte = next_encoded_byte(it, end);
629 
630  if ((continuation_byte & s_utf8_continuation_byte.m_encoding_mask) !=
631  s_utf8_continuation_byte.m_leading_byte)
632  {
633  return s_invalid_codepoint;
634  }
635 
636  shift -= s_utf8_continuation_byte.m_codepoint_size;
637  codepoint |= (continuation_byte & s_utf8_continuation_byte.m_codepoint_mask) << shift;
638  }
639 
640  // Finally, make sure the encoding was not overlong.
641  if (((codepoint < 0x80) && (bytes != 1)) ||
642  ((codepoint >= 0x80) && (codepoint < 0x800) && (bytes != 2)) ||
643  ((codepoint >= 0x800) && (codepoint < 0x10000) && (bytes != 3)))
644  {
645  return s_invalid_codepoint;
646  }
647 
648  return codepoint;
649 }
650 
651 //==================================================================================================
652 template <fly::StandardCharacter CharType>
653 template <typename IteratorType>
654 requires fly::SizeOfTypeIs<CharType, 2>
655 auto BasicUnicode<CharType>::codepoint_from_string(IteratorType &it, const IteratorType &end)
656  -> codepoint_type
657 {
658  auto next_codepoint = [&it, &end]() -> codepoint_type {
659  return next_encoded_byte(it, end);
660  };
661 
662  return create_codepoint_from_surrogates(std::move(next_codepoint));
663 }
664 
665 //==================================================================================================
666 template <fly::StandardCharacter CharType>
667 template <typename IteratorType>
668 requires fly::SizeOfTypeIs<CharType, 4>
669 auto BasicUnicode<CharType>::codepoint_from_string(IteratorType &it, const IteratorType &end)
670  -> codepoint_type
671 {
672  return next_encoded_byte(it, end);
673 }
674 
675 //==================================================================================================
676 template <fly::StandardCharacter CharType>
677 template <typename OutputIteratorType>
678 requires fly::SizeOfTypeIs<CharType, 1>
679 void BasicUnicode<CharType>::codepoint_to_string(codepoint_type codepoint, OutputIteratorType out)
680 {
681  if (codepoint < 0x80)
682  {
683  *out++ = static_cast<CharType>(codepoint);
684  }
685  else if (codepoint < 0x800)
686  {
687  *out++ = static_cast<CharType>(0xc0 | (codepoint >> 6));
688  *out++ = static_cast<CharType>(0x80 | (codepoint & 0x3f));
689  }
690  else if (codepoint < 0x10000)
691  {
692  *out++ = static_cast<CharType>(0xe0 | (codepoint >> 12));
693  *out++ = static_cast<CharType>(0x80 | ((codepoint >> 6) & 0x3f));
694  *out++ = static_cast<CharType>(0x80 | (codepoint & 0x3f));
695  }
696  else
697  {
698  *out++ = static_cast<CharType>(0xf0 | (codepoint >> 18));
699  *out++ = static_cast<CharType>(0x80 | ((codepoint >> 12) & 0x3f));
700  *out++ = static_cast<CharType>(0x80 | ((codepoint >> 6) & 0x3f));
701  *out++ = static_cast<CharType>(0x80 | (codepoint & 0x3f));
702  }
703 }
704 
705 //==================================================================================================
706 template <fly::StandardCharacter CharType>
707 template <typename OutputIteratorType>
708 requires fly::SizeOfTypeIs<CharType, 2>
709 void BasicUnicode<CharType>::codepoint_to_string(codepoint_type codepoint, OutputIteratorType out)
710 {
711  if (codepoint < 0x10000)
712  {
713  *out++ = static_cast<CharType>(codepoint);
714  }
715  else
716  {
717  codepoint -= 0x10000;
718  *out++ = static_cast<CharType>(s_high_surrogate_min | (codepoint >> 10));
719  *out++ = static_cast<CharType>(s_low_surrogate_min | (codepoint & 0x3ff));
720  }
721 }
722 
723 //==================================================================================================
724 template <fly::StandardCharacter CharType>
725 template <typename OutputIteratorType>
726 requires fly::SizeOfTypeIs<CharType, 4>
727 void BasicUnicode<CharType>::codepoint_to_string(codepoint_type codepoint, OutputIteratorType out)
728 {
729  *out++ = static_cast<CharType>(codepoint);
730 }
731 
732 //==================================================================================================
733 template <fly::StandardCharacter CharType>
734 auto BasicUnicode<CharType>::create_codepoint_from_surrogates(
735  std::function<codepoint_type()> next_codepoint) -> codepoint_type
736 {
737  auto is_high_surrogate = [](codepoint_type c) -> bool {
738  return (c >= s_high_surrogate_min) && (c <= s_high_surrogate_max);
739  };
740  auto is_low_surrogate = [](codepoint_type c) -> bool {
741  return (c >= s_low_surrogate_min) && (c <= s_low_surrogate_max);
742  };
743 
744  codepoint_type codepoint = next_codepoint();
745 
746  if (is_high_surrogate(codepoint))
747  {
748  const codepoint_type low_surrogate = next_codepoint();
749 
750  if (is_low_surrogate(low_surrogate))
751  {
752  // The formula to convert a surrogate pair to a single codepoint is:
753  //
754  // C = ((HS - 0xd800) * 0x400) + (LS - 0xdc00) + 0x10000
755  //
756  // Multiplying by 0x400 is the same as left-shifting 10 bits. The formula then becomes:
757  codepoint = (codepoint << 10) + low_surrogate - 0x35fdc00;
758  }
759  else
760  {
761  return s_invalid_codepoint;
762  }
763  }
764  else if (is_low_surrogate(codepoint))
765  {
766  return s_invalid_codepoint;
767  }
768 
769  return codepoint;
770 }
771 
772 //==================================================================================================
773 template <fly::StandardCharacter CharType>
774 bool BasicUnicode<CharType>::validate_codepoint(codepoint_type codepoint)
775 {
776  if ((codepoint >= s_high_surrogate_min) && (codepoint <= s_low_surrogate_max))
777  {
778  // Reserved codepoint.
779  return false;
780  }
781  else if (codepoint > s_max_codepoint)
782  {
783  // Out-of-range codepoint.
784  return false;
785  }
786 
787  return true;
788 }
789 
790 //==================================================================================================
791 template <fly::StandardCharacter CharType>
792 template <typename IteratorType>
793 inline auto BasicUnicode<CharType>::next_encoded_byte(IteratorType &it, const IteratorType &end)
794  -> codepoint_type
795 {
796  return (it == end) ? s_invalid_codepoint : static_cast<codepoint_type>(*(it++));
797 }
798 
799 } // namespace fly::detail
Definition: unicode.hpp:31
static std::optional< codepoint_type > decode_codepoint(IteratorType &it, const IteratorType &end)
static std::optional< string_type > encode_codepoint(codepoint_type codepoint)
Definition: unicode.hpp:438
static std::optional< string_type > unescape_codepoint(IteratorType &it, const IteratorType &end)
static bool validate_encoding(IteratorType &it, const IteratorType &end)
Definition: unicode.hpp:367
static std::optional< DesiredStringType > convert_encoding(view_type value)
Definition: unicode.hpp:383
requires static fly::UnicodePrefixCharacter< UnicodePrefix > std::optional< string_type > escape_codepoint(IteratorType &it, const IteratorType &end)
static bool convert_encoding_into(view_type value, OutputIteratorType out)
Definition: unicode.hpp:399
Definition: traits.hpp:18