[ Index ]

PHP Cross Reference of WordPress Trunk (Updated Daily)

Search

title

Body

[close]

/wp-includes/ -> compat-utf8.php (source)

   1  <?php
   2  
   3  /**
   4   * Fallback mechanism for safely validating UTF-8 bytes.
   5   *
   6   * By implementing a raw method here the code will behave in the same way on
   7   * all installed systems, regardless of what extensions are installed.
   8   *
   9   * @see wp_is_valid_utf8
  10   *
  11   * @since 6.9.0
  12   * @access private
  13   *
  14   * @param string $bytes String which might contain text encoded as UTF-8.
  15   * @return bool Whether the provided bytes can decode as valid UTF-8.
  16   */
  17  function _wp_is_valid_utf8_fallback( string $bytes ): bool {
  18      $end = strlen( $bytes );
  19  
  20      for ( $i = 0; $i < $end; $i++ ) {
  21          /*
  22           * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
  23           *
  24           * This optimization step improves the speed from 10x to 100x
  25           * depending on whether the JIT has optimized the function.
  26           */
  27          $i += strspn(
  28              $bytes,
  29              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
  30              "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
  31              " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
  32              $i
  33          );
  34          if ( $i >= $end ) {
  35              break;
  36          }
  37  
  38          /**
  39           * The above fast-track handled all single-byte UTF-8 characters. What
  40           * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
  41           *
  42           * Therefore everything past here is checking those multibyte sequences.
  43           * Because it’s possible that there are truncated characters, the use of
  44           * the null-coalescing operator with "\xC0" is a convenience for skipping
  45           * length checks on every continuation bytes. This works because 0xC0 is
  46           * always invalid in a UTF-8 string, meaning that if the string has been
  47           * truncated, it will find 0xC0 and reject as invalid UTF-8.
  48           *
  49           *  > [The following table] lists all of the byte sequences that are well-formed
  50           * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
  51           * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
  52           * > outside of the ranges listed is ill-formed.
  53           *
  54           * > Table 3-7. Well-Formed UTF-8 Byte Sequences
  55           *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
  56           *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
  57           *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
  58           *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
  59           *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
  60           *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
  61           *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
  62           *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
  63           *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
  64           *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
  65           *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
  66           *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
  67           *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
  68           *
  69           * Notice that all valid third and forth bytes are in the range 80..BF. This
  70           * validator takes advantage of that to only check the range of those bytes once.
  71           *
  72           * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
  73           * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
  74           */
  75  
  76          $b1 = ord( $bytes[ $i ] );
  77          $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
  78  
  79          // Valid two-byte code points.
  80  
  81          if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
  82              ++$i;
  83              continue;
  84          }
  85  
  86          $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
  87  
  88          // Valid three-byte code points.
  89  
  90          if ( $b3 < 0x80 || $b3 > 0xBF ) {
  91              return false;
  92          }
  93  
  94          if (
  95              ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
  96              ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
  97              ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
  98              ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
  99          ) {
 100              $i += 2;
 101              continue;
 102          }
 103  
 104          $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
 105  
 106          // Valid four-byte code points.
 107  
 108          if ( $b4 < 0x80 || $b4 > 0xBF ) {
 109              return false;
 110          }
 111  
 112          if (
 113              ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
 114              ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 115              ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
 116          ) {
 117              $i += 3;
 118              continue;
 119          }
 120  
 121          // Any other sequence is invalid.
 122          return false;
 123      }
 124  
 125      // Reaching the end implies validating every byte.
 126      return true;
 127  }


Generated : Tue Sep 16 08:20:04 2025 Cross-referenced by PHPXref