PHPXRef 0.7.1 : WordPress Trunk (Updated Daily) : /wp-includes/utf8.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  if ( extension_loaded( 'mbstring' ) ) :
   4      /**
   5       * Determines if a given byte string represents a valid UTF-8 encoding.
   6       *
   7       * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
   8       * it is still possible. Many texts are simultaneously valid UTF-8,
   9       * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
  10       *
  11       * Example:
  12       *
  13       *     true === wp_is_valid_utf8( '' );
  14       *     true === wp_is_valid_utf8( 'just a test' );
  15       *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
  16       *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
  17       *     true === wp_is_valid_utf8( '✏' );              // Pencil, U+270F.
  18       *
  19       *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
  20       *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
  21       *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
  22       *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
  23       *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
  24       *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
  25       *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
  26       *
  27       *  A “valid” string consists of “well-formed UTF-8 code unit sequence[s],” meaning
  28       *  that the bytes conform to the UTF-8 encoding scheme, all characters use the minimal
  29       *  byte sequence required by UTF-8, and that no sequence encodes a UTF-16 surrogate
  30       *  code point or any character above the representable range.
  31       *
  32       * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G32860
  33       *
  34       * @since 6.9.0
  35       *
  36       * @param string $bytes String which might contain text encoded as UTF-8.
  37       * @return bool Whether the provided bytes can decode as valid UTF-8.
  38       */
  39  	function wp_is_valid_utf8( string $bytes ): bool {
  40          return mb_check_encoding( $bytes, 'UTF-8' );
  41      }
  42  else :
  43      /**
  44       * Fallback function for validating UTF-8.
  45       *
  46       * @ignore
  47       * @private
  48       *
  49       * @since 6.9.0
  50       */
  51  	function wp_is_valid_utf8( string $string ): bool {
  52          return _wp_is_valid_utf8_fallback( $string );
  53      }
  54  endif;
  55  
  56  if (
  57      extension_loaded( 'mbstring' ) &&
  58      // Maximal subpart substitution introduced by php/php-src@04e59c916f12b322ac55f22314e31bd0176d01cb.
  59      version_compare( PHP_VERSION, '8.1.6', '>=' )
  60  ) :
  61      /**
  62       * Replaces ill-formed UTF-8 byte sequences with the Unicode Replacement Character.
  63       *
  64       * Knowing what to do in the presence of text encoding issues can be complicated.
  65       * This function replaces invalid spans of bytes to neutralize any corruption that
  66       * may be there and prevent it from causing further problems downstream.
  67       *
  68       * However, it’s not always ideal to replace those bytes. In some settings it may
  69       * be best to leave the invalid bytes in the string so that downstream code can handle
  70       * them in a specific way. Replacing the bytes too early, like escaping for HTML too
  71       * early, can introduce other forms of corruption and data loss.
  72       *
  73       * When in doubt, use this function to replace spans of invalid bytes.
  74       *
  75       * Replacement follows the “maximal subpart” algorithm for secure and interoperable
  76       * strings. This can lead to sequences of multiple replacement characters in a row.
  77       *
  78       * Example:
  79       *
  80       *     // Valid strings come through unchanged.
  81       *     'test' === wp_scrub_utf8( 'test' );
  82       *
  83       *     // Invalid sequences of bytes are replaced.
  84       *     $invalid = "the byte \xC0 is never allowed in a UTF-8 string.";
  85       *     "the byte \u{FFFD} is never allowed in a UTF-8 string." === wp_scrub_utf8( $invalid );
  86       *     'the byte � is never allowed in a UTF-8 string.' === wp_scrub_utf8( $invalid );
  87       *
  88       *     // Maximal subparts are replaced individually.
  89       *     '.�.' === wp_scrub_utf8( ".\xC0." );              // C0 is never valid.
  90       *     '.�.' === wp_scrub_utf8( ".\xE2\x8C." );          // Missing A3 at end.
  91       *     '.��.' === wp_scrub_utf8( ".\xE2\x8C\xE2\x8C." ); // Maximal subparts replaced separately.
  92       *     '.��.' === wp_scrub_utf8( ".\xC1\xBF." );         // Overlong sequence.
  93       *     '.���.' === wp_scrub_utf8( ".\xED\xA0\x80." );    // Surrogate half.
  94       *
  95       * Note! The Unicode Replacement Character is itself a Unicode character (U+FFFD).
  96       * Once a span of invalid bytes has been replaced by one, it will not be possible
  97       * to know whether the replacement character was originally intended to be there
  98       * or if it is the result of scrubbing bytes. It is ideal to leave replacement for
  99       * display only, but some contexts (e.g. generating XML or passing data into a
 100       * large language model) require valid input strings.
 101       *
 102       * @since 6.9.0
 103       *
 104       * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
 105       *
 106       * @param string $text String which is assumed to be UTF-8 but may contain invalid sequences of bytes.
 107       * @return string Input text with invalid sequences of bytes replaced with the Unicode replacement character.
 108       */
	function wp_scrub_utf8( $text ) {
        /*
         * The substitute character is a global mbstring setting, so the
         * current value is remembered before switching to U+FFFD.
         *
         * While it looks like setting the substitute character could fail,
         * the internal PHP code will never fail when provided a valid
         * code point as a number, so its return value is not checked.
         */
        $prev_replacement_character = mb_substitute_character();
        mb_substitute_character( 0xFFFD );

        try {
            return mb_scrub( $text, 'UTF-8' );
        } finally {
            /*
             * Restore the previous setting on every exit path. Without this,
             * an exception from mb_scrub() (e.g. a TypeError for a non-string
             * argument) would leave U+FFFD installed for all later mbstring calls.
             */
            mb_substitute_character( $prev_replacement_character );
        }
    }
 123  else :
 124      /**
 125       * Fallback function for scrubbing UTF-8.
 126       *
 127       * @ignore
 128       * @private
 129       *
 130       * @since 6.9.0
 131       */
	function wp_scrub_utf8( $text ) {
        // Used when mbstring is not loaded or PHP is older than 8.1.6;
        // scrubbing is handed to the fallback routine.
        $scrubbed = _wp_scrub_utf8_fallback( $text );

        return $scrubbed;
    }
 135  endif;
 136  
 137  /**
 138   * Returns whether the given string contains Unicode noncharacters.
 139   *
 140   * XML recommends against using noncharacters and HTML forbids their
 141   * use in attribute names. Unicode recommends that they not be used
 142   * in open exchange of data.
 143   *
 144   * Noncharacters are code points within the following ranges:
 145   *  - U+FDD0–U+FDEF
 146   *  - U+FFFE–U+FFFF
 147   *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
 148   *
 149   * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
 150   * @see https://www.w3.org/TR/xml/#charsets
 151   * @see https://html.spec.whatwg.org/#attributes-2
 152   *
 153   * @since 6.9.0
 154   *
 155   * @param string $text Are there noncharacters in this string?
 156   * @return bool Whether noncharacters were found in the string.
 157   */
 158  function wp_has_noncharacters( string $text ): bool {
 159      /*
 160       * Match the UTF-8 byte sequences directly so malformed UTF-8 elsewhere
 161       * in the subject does not cause PCRE's Unicode mode to reject the string.
 162       */
 163      return 1 === preg_match(
 164          '~
 165              # U+FDD0-U+FDEF, U+FFFE-U+FFFF
 166              \xEF(?:\xB7[\x90-\xAF]|\xBF[\xBE\xBF])
 167              |
 168              # U+nFFFE/U+nFFFF
 169              (?:\xF0[\x9F\xAF\xBF]|[\xF1-\xF3][\x8F\x9F\xAF\xBF]|\xF4\x8F)\xBF[\xBE\xBF]
 170          ~x',
 171          $text
 172      );
 173  }
PHP Cross Reference of WordPress Trunk (Updated Daily)

/wp-includes/ -> utf8.php (source)