PHPXRef 0.7.1 : WordPress Trunk (Updated Daily) : /wp-includes/compat-utf8.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  /**
   4   * Finds spans of valid and invalid UTF-8 bytes in a given string.
   5   *
   6   * This is a low-level tool to power various UTF-8 functionality.
   7   * It scans through a string until it finds invalid byte spans.
   8   * When it does this, it does three things:
   9   *
  10   *  - Assigns `$at` to the position after the last successful code point.
  11   *  - Assigns `$invalid_length` to the length of the maximal subpart of
  12   *    the invalid bytes starting at `$at`.
  13   *  - Returns how many code points were successfully scanned.
  14   *
  15   * This information is enough to build a number of useful UTF-8 functions.
  16   *
  17   * Example:
  18   *
  19   *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
  20   *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
  21   *     $at = $invalid_length = 0;
  22   *
  23   *     // The first step finds the invalid 0xF1 byte.
  24   *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
  25   *     $at === 2; $invalid_length === 1;
  26   *
  27   *     // The second step continues to the end of the string.
  28   *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
  29   *     $at === 4; $invalid_length === 0;
  30   *
  31   * Note! While passing an options array here might be convenient from a calling-code standpoint,
  32   *       this function is intended to serve as a very low-level foundation upon which to build
  33   *       higher level functionality. For the sake of keeping costs explicit all arguments are
  34   *       passed directly.
  35   *
  36   * @since 6.9.0
  37   * @access private
  38   *
  39   * @param string    $bytes             UTF-8 encoded string which might include invalid spans of bytes.
  40   * @param int       $at                Where to start scanning.
  41   * @param int       $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
  42   * @param int|null  $max_bytes         Stop scanning after this many bytes have been seen.
  43   * @param int|null  $max_code_points   Stop scanning after this many code points have been seen.
  44   * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
  45   * @return int How many code points were successfully scanned.
  46   */
  47  function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
  48      $byte_length       = strlen( $bytes );
  49      $end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
  50      $invalid_length    = 0;
  51      $count             = 0;
  52      $max_count         = $max_code_points ?? PHP_INT_MAX;
  53      $has_noncharacters = false;
  54  
  55      for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
  56          /*
  57           * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
  58           *
  59           * This optimization step improves the speed from 10x to 100x
  60           * depending on whether the JIT has optimized the function.
  61           */
  62          $ascii_byte_count = strspn(
  63              $bytes,
  64              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
  65              "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
  66              " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
  67              $i,
  68              $end - $i
  69          );
  70  
  71          if ( $count + $ascii_byte_count >= $max_count ) {
  72              $at    = $i + ( $max_count - $count );
  73              $count = $max_count;
  74              return $count;
  75          }
  76  
  77          $count += $ascii_byte_count;
  78          $i     += $ascii_byte_count;
  79  
  80          if ( $i >= $end ) {
  81              $at = $end;
  82              return $count;
  83          }
  84  
  85          /**
  86           * The above fast-track handled all single-byte UTF-8 characters. What
  87           * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
  88           *
  89           * Therefore everything past here is checking those multibyte sequences.
  90           *
  91           * It may look like there’s a need to check against the max bytes here,
  92           * but since each match of a single character returns, this functions will
  93           * bail already if crossing the max-bytes threshold. This function SHALL
  94           * NOT return in the middle of a multi-byte character, so if a character
  95           * falls on each side of the max bytes, the entire character will be scanned.
  96           *
  97           * Because it’s possible that there are truncated characters, the use of
  98           * the null-coalescing operator with "\xC0" is a convenience for skipping
  99           * length checks on every continuation bytes. This works because 0xC0 is
 100           * always invalid in a UTF-8 string, meaning that if the string has been
 101           * truncated, it will find 0xC0 and reject as invalid UTF-8.
 102           *
 103           * > [The following table] lists all of the byte sequences that are well-formed
 104           * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
 105           * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
 106           * > outside of the ranges listed is ill-formed.
 107           *
 108           * > Table 3-7. Well-Formed UTF-8 Byte Sequences
 109           *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
 110           *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
 111           *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
 112           *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
 113           *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
 114           *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
 115           *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
 116           *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
 117           *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
 118           *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
 119           *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
 120           *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
 121           *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
 122           *
 123           * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
 124           */
 125  
 126          // Valid two-byte code points.
 127          $b1 = ord( $bytes[ $i ] );
 128          $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
 129  
 130          if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
 131              ++$count;
 132              ++$i;
 133              continue;
 134          }
 135  
 136          // Valid three-byte code points.
 137          $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
 138  
 139          if ( $b3 < 0x80 || $b3 > 0xBF ) {
 140              goto invalid_utf8;
 141          }
 142  
 143          if (
 144              ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
 145              ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 146              ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
 147              ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
 148          ) {
 149              ++$count;
 150              $i += 2;
 151  
 152              // Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
 153              if ( 0xEF === $b1 ) {
 154                  $has_noncharacters |= (
 155                      ( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
 156                      ( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
 157                  );
 158              }
 159  
 160              continue;
 161          }
 162  
 163          // Valid four-byte code points.
 164          $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
 165  
 166          if ( $b4 < 0x80 || $b4 > 0xBF ) {
 167              goto invalid_utf8;
 168          }
 169  
 170          if (
 171              ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
 172              ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 173              ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
 174          ) {
 175              ++$count;
 176              $i += 3;
 177  
 178              // Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
 179              $has_noncharacters |= (
 180                  ( 0x0F === ( $b2 & 0x0F ) ) &&
 181                  0xBF === $b3 &&
 182                  ( 0xBE === $b4 || 0xBF === $b4 )
 183              );
 184  
 185              continue;
 186          }
 187  
 188          /**
 189           * When encountering invalid byte sequences, Unicode suggests finding the
 190           * maximal subpart of a text and replacing that subpart with a single
 191           * replacement character.
 192           *
 193           * > This practice is more secure because it does not result in the
 194           * > conversion consuming parts of valid sequences as though they were
 195           * > invalid. It also guarantees at least one replacement character will
 196           * > occur for each instance of an invalid sequence in the original text.
 197           * > Furthermore, this practice can be defined consistently for better
 198           * > interoperability between different implementations of conversion.
 199           *
 200           * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
 201           */
 202          invalid_utf8:
 203          $at             = $i;
 204          $invalid_length = 1;
 205  
 206          // Single-byte and two-byte characters.
 207          if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
 208              return $count;
 209          }
 210  
 211          $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
 212          $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
 213  
 214          // Find the maximal subpart and skip past it.
 215          if ( 0xE0 === ( $b1 & 0xF0 ) ) {
 216              // Three-byte characters.
 217              $b2_valid = (
 218                  ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
 219                  ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 220                  ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
 221                  ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
 222              );
 223  
 224              $invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
 225              return $count;
 226          } elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
 227              // Four-byte characters.
 228              $b2_valid = (
 229                  ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
 230                  ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 231                  ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
 232              );
 233  
 234              $b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
 235  
 236              $invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
 237              return $count;
 238          }
 239  
 240          return $count;
 241      }
 242  
 243      $at = $i;
 244      return $count;
 245  }
 246  
 247  /**
 248   * Fallback mechanism for safely validating UTF-8 bytes.
 249   *
 250   * @since 6.9.0
 251   * @access private
 252   *
 253   * @see wp_is_valid_utf8()
 254   *
 255   * @param string $bytes String which might contain text encoded as UTF-8.
 256   * @return bool Whether the provided bytes can decode as valid UTF-8.
 257   */
 258  function _wp_is_valid_utf8_fallback( string $bytes ): bool {
 259      $bytes_length = strlen( $bytes );
 260      if ( 0 === $bytes_length ) {
 261          return true;
 262      }
 263  
 264      $next_byte_at   = 0;
 265      $invalid_length = 0;
 266  
 267      _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
 268  
 269      return $bytes_length === $next_byte_at && 0 === $invalid_length;
 270  }
 271  
 272  /**
 273   * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
 274   *
 275   * Example:
 276   *
 277   *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
 278   *
 279   * @since 6.9.0
 280   * @access private
 281   *
 282   * @see wp_scrub_utf8()
 283   *
 284   * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
 285   * @return string Input string with spans of invalid bytes swapped with the replacement character.
 286   */
 287  function _wp_scrub_utf8_fallback( string $bytes ): string {
 288      $bytes_length   = strlen( $bytes );
 289      $next_byte_at   = 0;
 290      $was_at         = 0;
 291      $invalid_length = 0;
 292      $scrubbed       = '';
 293  
 294      while ( $next_byte_at <= $bytes_length ) {
 295          _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
 296  
 297          if ( $next_byte_at >= $bytes_length ) {
 298              if ( 0 === $was_at ) {
 299                  return $bytes;
 300              }
 301  
 302              return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length );
 303          }
 304  
 305          $scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at );
 306          $scrubbed .= "\u{FFFD}";
 307  
 308          $next_byte_at += $invalid_length;
 309          $was_at        = $next_byte_at;
 310      }
 311  
 312      return $scrubbed;
 313  }
 314  
 315  /**
 316   * Returns how many code points are found in the given UTF-8 string.
 317   *
 318   * Invalid spans of bytes count as a single code point according
 319   * to the maximal subpart rule. This function is a fallback method
 320   * for calling `mb_strlen( $text, 'UTF-8' )`.
 321   *
 322   * When negative values are provided for the byte offsets or length,
 323   * this will always report zero code points.
 324   *
 325   * Example:
 326   *
 327   *     4  === _wp_utf8_codepoint_count( 'text' );
 328   *
 329   *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
 330   *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
 331   *
 332   * @since 6.9.0
 333   * @access private
 334   *
 335   * @param string $text            Count code points in this string.
 336   * @param ?int   $byte_offset     Start counting after this many bytes in `$text`. Must be positive.
 337   * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
 338   *                                Default is to scan until the end of the string. Must be positive.
 339   * @return int How many code points were found.
 340   */
 341  function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
 342      if ( $byte_offset < 0 ) {
 343          return 0;
 344      }
 345  
 346      $count           = 0;
 347      $at              = $byte_offset;
 348      $end             = strlen( $text );
 349      $invalid_length  = 0;
 350      $max_byte_length = min( $end - $at, $max_byte_length );
 351  
 352      while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
 353          $count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );
 354          $count += $invalid_length > 0 ? 1 : 0;
 355          $at    += $invalid_length;
 356      }
 357  
 358      return $count;
 359  }
 360  
 361  /**
 362   * Given a starting offset within a string and a maximum number of code points,
 363   * return how many bytes are occupied by the span of characters.
 364   *
 365   * Invalid spans of bytes count as a single code point according to the maximal
 366   * subpart rule. This function is a fallback method for calling
 367   * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
 368   *
 369   * @since 6.9.0
 370   * @access private
 371   *
 372   * @param string $text              Count bytes of span in this text.
 373   * @param int    $byte_offset       Start counting at this byte offset.
 374   * @param int    $max_code_points   Stop counting after this many code points have been seen,
 375   *                                  or at the end of the string.
 376   * @param ?int   $found_code_points Optional. Will be set to number of found code points in
 377   *                                  span, as this might be smaller than the maximum count if
 378   *                                  the string is not long enough.
 379   * @return int Number of bytes spanned by the code points.
 380   */
 381  function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
 382      $was_at            = $byte_offset;
 383      $invalid_length    = 0;
 384      $end               = strlen( $text );
 385      $found_code_points = 0;
 386  
 387      while ( $byte_offset < $end && $found_code_points < $max_code_points ) {
 388          $needed      = $max_code_points - $found_code_points;
 389          $chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed );
 390  
 391          $found_code_points += $chunk_count;
 392  
 393          // Invalid spans only convey one code point count regardless of how long they are.
 394          if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) {
 395              ++$found_code_points;
 396              $byte_offset += $invalid_length;
 397          }
 398      }
 399  
 400      return $byte_offset - $was_at;
 401  }
 402  
 403  /**
 404   * Fallback support for determining if a string contains Unicode noncharacters.
 405   *
 406   * @since 6.9.0
 407   * @access private
 408   *
 409   * @see \wp_has_noncharacters()
 410   *
 411   * @param string $text Are there noncharacters in this string?
 412   * @return bool Whether noncharacters were found in the string.
 413   */
 414  function _wp_has_noncharacters_fallback( string $text ): bool {
 415      $at                = 0;
 416      $invalid_length    = 0;
 417      $has_noncharacters = false;
 418      $end               = strlen( $text );
 419  
 420      while ( $at < $end && ! $has_noncharacters ) {
 421          _wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters );
 422          $at += $invalid_length;
 423      }
 424  
 425      return $has_noncharacters;
 426  }
 427  
 428  /**
 429   * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
 430   * with the deprecated function from the PHP standard library.
 431   *
 432   * @since 6.9.0
 433   * @access private
 434   *
 435   * @see \utf8_encode()
 436   *
 437   * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
 438   * @return string Text converted into UTF-8.
 439   */
 440  function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
 441      $iso_8859_1_text = (string) $iso_8859_1_text;
 442      $at              = 0;
 443      $was_at          = 0;
 444      $end             = strlen( $iso_8859_1_text );
 445      $utf8            = '';
 446  
 447      while ( $at < $end ) {
 448          // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
 449          $ascii_byte_count = strspn(
 450              $iso_8859_1_text,
 451              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
 452              "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
 453              " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
 454              $at
 455          );
 456  
 457          if ( $ascii_byte_count > 0 ) {
 458              $at += $ascii_byte_count;
 459              continue;
 460          }
 461  
 462          // All other bytes transform into two-byte UTF-8 sequences.
 463          $code_point = ord( $iso_8859_1_text[ $at ] );
 464          $byte1      = chr( 0xC0 | ( $code_point >> 6 ) );
 465          $byte2      = chr( 0x80 | ( $code_point & 0x3F ) );
 466  
 467          $utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at );
 468          $utf8 .= "{$byte1}{$byte2}";
 469  
 470          ++$at;
 471          $was_at = $at;
 472      }
 473  
 474      if ( 0 === $was_at ) {
 475          return $iso_8859_1_text;
 476      }
 477  
 478      $utf8 .= substr( $iso_8859_1_text, $was_at );
 479      return $utf8;
 480  }
 481  
 482  /**
 483   * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
 484   * with the deprecated function from the PHP standard library.
 485   *
 486   * @since 6.9.0
 487   * @access private
 488   *
 489   * @see \utf8_decode()
 490   *
 491   * @param string $utf8_text Text treated as UTF-8 bytes.
 492   * @return string Text converted into ISO-8859-1.
 493   */
 494  function _wp_utf8_decode_fallback( $utf8_text ) {
 495      $utf8_text       = (string) $utf8_text;
 496      $at              = 0;
 497      $was_at          = 0;
 498      $end             = strlen( $utf8_text );
 499      $iso_8859_1_text = '';
 500  
 501      while ( $at < $end ) {
 502          // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
 503          $ascii_byte_count = strspn(
 504              $utf8_text,
 505              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
 506              "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
 507              " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
 508              $at
 509          );
 510  
 511          if ( $ascii_byte_count > 0 ) {
 512              $at += $ascii_byte_count;
 513              continue;
 514          }
 515  
 516          $next_at        = $at;
 517          $invalid_length = 0;
 518          $found          = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 );
 519          $span_length    = $next_at - $at;
 520          $next_byte      = '?';
 521  
 522          if ( 1 !== $found ) {
 523              if ( $invalid_length > 0 ) {
 524                  $next_byte = '';
 525                  goto flush_sub_part;
 526              }
 527  
 528              break;
 529          }
 530  
 531          // All convertible code points are two-bytes long.
 532          $byte1 = ord( $utf8_text[ $at ] );
 533          if ( 0xC0 !== ( $byte1 & 0xE0 ) ) {
 534              goto flush_sub_part;
 535          }
 536  
 537          // All convertible code points are not greater than U+FF.
 538          $byte2      = ord( $utf8_text[ $at + 1 ] );
 539          $code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) );
 540          if ( $code_point > 0xFF ) {
 541              goto flush_sub_part;
 542          }
 543  
 544          $next_byte = chr( $code_point );
 545  
 546          flush_sub_part:
 547          $iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at );
 548          $iso_8859_1_text .= $next_byte;
 549          $at              += $span_length;
 550          $was_at           = $at;
 551  
 552          if ( $invalid_length > 0 ) {
 553              $iso_8859_1_text .= '?';
 554              $at              += $invalid_length;
 555              $was_at           = $at;
 556          }
 557      }
 558  
 559      if ( 0 === $was_at ) {
 560          return $utf8_text;
 561      }
 562  
 563      $iso_8859_1_text .= substr( $utf8_text, $was_at );
 564      return $iso_8859_1_text;
 565  }
PHP Cross Reference of WordPress Trunk (Updated Daily)

/wp-includes/ -> compat-utf8.php (source)