[ Index ]

PHP Cross Reference of WordPress Trunk (Updated Daily)

Search

title

Body

[close]

/wp-includes/ -> compat-utf8.php (source)

   1  <?php
   2  
   3  /**
   4   * Finds spans of valid and invalid UTF-8 bytes in a given string.
   5   *
   6   * This is a low-level tool to power various UTF-8 functionality.
   7   * It scans through a string until it finds invalid byte spans.
   8   * When it does this, it does three things:
   9   *
  10   *  - Assigns `$at` to the position after the last successful code point.
  11   *  - Assigns `$invalid_length` to the length of the maximal subpart of
  12   *    the invalid bytes starting at `$at`.
  13   *  - Returns how many code points were successfully scanned.
  14   *
  15   * This information is enough to build a number of useful UTF-8 functions.
  16   *
  17   * Example:
  18   *
  19   *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
  20   *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
  21   *     $at = $invalid_length = 0;
  22   *
  23   *     // The first step finds the invalid 0xF1 byte.
  24   *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
  25   *     $at === 2; $invalid_length === 1;
  26   *
  27   *     // The second step continues to the end of the string.
  28   *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
  29   *     $at === 4; $invalid_length === 0;
  30   *
  31   * Note! This functions many arguments are passed without and “options”
  32   * array. This choice is based on the fact that this is a low-level function
  33   * and there’s no need to create an array of items on every invocation.
  34   *
  35   * @since 6.9.0
  36   * @access private
  37   *
  38   * @param string   $bytes           UTF-8 encoded string which might include invalid spans of bytes.
  39   * @param int      $at              Where to start scanning.
  40   * @param int      $invalid_length  Will be set to how many bytes are to be ignored after `$at`.
  41   * @param int|null $max_bytes       Stop scanning after this many bytes have been seen.
  42   * @param int|null $max_code_points Stop scanning after this many code points have been seen.
  43   * @return int How many code points were successfully scanned.
  44   */
  45  function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
  46      $byte_length    = strlen( $bytes );
  47      $end            = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
  48      $invalid_length = 0;
  49      $count          = 0;
  50      $max_count      = $max_code_points ?? PHP_INT_MAX;
  51  
  52      for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
  53          /*
  54           * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
  55           *
  56           * This optimization step improves the speed from 10x to 100x
  57           * depending on whether the JIT has optimized the function.
  58           */
  59          $ascii_byte_count = strspn(
  60              $bytes,
  61              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
  62              "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
  63              " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
  64              $i,
  65              $end - $i
  66          );
  67  
  68          if ( $count + $ascii_byte_count >= $max_count ) {
  69              $at    = $i + ( $max_count - $count );
  70              $count = $max_count;
  71              return $count;
  72          }
  73  
  74          $count += $ascii_byte_count;
  75          $i     += $ascii_byte_count;
  76  
  77          if ( $i >= $end ) {
  78              $at = $end;
  79              return $count;
  80          }
  81  
  82          /**
  83           * The above fast-track handled all single-byte UTF-8 characters. What
  84           * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
  85           *
  86           * Therefore everything past here is checking those multibyte sequences.
  87           *
  88           * It may look like there’s a need to check against the max bytes here,
  89           * but since each match of a single character returns, this functions will
  90           * bail already if crossing the max-bytes threshold. This function SHALL
  91           * NOT return in the middle of a multi-byte character, so if a character
  92           * falls on each side of the max bytes, the entire character will be scanned.
  93           *
  94           * Because it’s possible that there are truncated characters, the use of
  95           * the null-coalescing operator with "\xC0" is a convenience for skipping
  96           * length checks on every continuation bytes. This works because 0xC0 is
  97           * always invalid in a UTF-8 string, meaning that if the string has been
  98           * truncated, it will find 0xC0 and reject as invalid UTF-8.
  99           *
 100           * > [The following table] lists all of the byte sequences that are well-formed
 101           * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
 102           * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
 103           * > outside of the ranges listed is ill-formed.
 104           *
 105           * > Table 3-7. Well-Formed UTF-8 Byte Sequences
 106           *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
 107           *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
 108           *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
 109           *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
 110           *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
 111           *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
 112           *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
 113           *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
 114           *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
 115           *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
 116           *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
 117           *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
 118           *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
 119           *
 120           * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
 121           */
 122  
 123          // Valid two-byte code points.
 124          $b1 = ord( $bytes[ $i ] );
 125          $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
 126  
 127          if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
 128              ++$count;
 129              ++$i;
 130              continue;
 131          }
 132  
 133          // Valid three-byte code points.
 134          $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
 135  
 136          if ( $b3 < 0x80 || $b3 > 0xBF ) {
 137              goto invalid_utf8;
 138          }
 139  
 140          if (
 141              ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
 142              ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 143              ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
 144              ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
 145          ) {
 146              ++$count;
 147              $i += 2;
 148              continue;
 149          }
 150  
 151          // Valid four-byte code points.
 152          $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
 153  
 154          if ( $b4 < 0x80 || $b4 > 0xBF ) {
 155              goto invalid_utf8;
 156          }
 157  
 158          if (
 159              ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
 160              ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 161              ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
 162          ) {
 163              ++$count;
 164              $i += 3;
 165              continue;
 166          }
 167  
 168          /**
 169           * When encountering invalid byte sequences, Unicode suggests finding the
 170           * maximal subpart of a text and replacing that subpart with a single
 171           * replacement character.
 172           *
 173           * > This practice is more secure because it does not result in the
 174           * > conversion consuming parts of valid sequences as though they were
 175           * > invalid. It also guarantees at least one replacement character will
 176           * > occur for each instance of an invalid sequence in the original text.
 177           * > Furthermore, this practice can be defined consistently for better
 178           * > interoperability between different implementations of conversion.
 179           *
 180           * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
 181           */
 182          invalid_utf8:
 183          $at             = $i;
 184          $invalid_length = 1;
 185  
 186          // Single-byte and two-byte characters.
 187          if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
 188              return $count;
 189          }
 190  
 191          $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
 192          $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
 193  
 194          // Find the maximal subpart and skip past it.
 195          if ( 0xE0 === ( $b1 & 0xF0 ) ) {
 196              // Three-byte characters.
 197              $b2_valid = (
 198                  ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
 199                  ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 200                  ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
 201                  ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
 202              );
 203  
 204              $invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
 205              return $count;
 206          } elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
 207              // Four-byte characters.
 208              $b2_valid = (
 209                  ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
 210                  ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 211                  ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
 212              );
 213  
 214              $b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
 215  
 216              $invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
 217              return $count;
 218          }
 219  
 220          return $count;
 221      }
 222  
 223      $at = $i;
 224      return $count;
 225  }
 226  
 227  /**
 228   * Fallback mechanism for safely validating UTF-8 bytes.
 229   *
 230   * @since 6.9.0
 231   * @access private
 232   *
 233   * @see wp_is_valid_utf8()
 234   *
 235   * @param string $bytes String which might contain text encoded as UTF-8.
 236   * @return bool Whether the provided bytes can decode as valid UTF-8.
 237   */
 238  function _wp_is_valid_utf8_fallback( string $bytes ): bool {
 239      $bytes_length = strlen( $bytes );
 240      if ( 0 === $bytes_length ) {
 241          return true;
 242      }
 243  
 244      $next_byte_at   = 0;
 245      $invalid_length = 0;
 246  
 247      _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
 248  
 249      return $bytes_length === $next_byte_at && 0 === $invalid_length;
 250  }
 251  
 252  /**
 253   * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
 254   *
 255   * Example:
 256   *
 257   *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
 258   *
 259   * @since 6.9.0
 260   * @access private
 261   *
 262   * @see wp_scrub_utf8()
 263   *
 264   * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
 265   * @return string Input string with spans of invalid bytes swapped with the replacement character.
 266   */
 267  function _wp_scrub_utf8_fallback( string $bytes ): string {
 268      $bytes_length   = strlen( $bytes );
 269      $next_byte_at   = 0;
 270      $was_at         = 0;
 271      $invalid_length = 0;
 272      $scrubbed       = '';
 273  
 274      while ( $next_byte_at <= $bytes_length ) {
 275          _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
 276  
 277          if ( $next_byte_at >= $bytes_length ) {
 278              if ( 0 === $was_at ) {
 279                  return $bytes;
 280              }
 281  
 282              return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length );
 283          }
 284  
 285          $scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at );
 286          $scrubbed .= "\u{FFFD}";
 287  
 288          $next_byte_at += $invalid_length;
 289          $was_at        = $next_byte_at;
 290      }
 291  
 292      return $scrubbed;
 293  }


Generated : Fri Oct 10 08:20:03 2025 Cross-referenced by PHPXref