[ Index ]

PHP Cross Reference of WordPress Trunk (Updated Daily)

Search

title

Body

[close]

/wp-includes/html-api/ -> class-wp-html-doctype-info.php (source)

   1  <?php
   2  /**
   3   * HTML API: WP_HTML_Doctype_Info class
   4   *
   5   * @package WordPress
   6   * @subpackage HTML-API
   7   * @since 6.7.0
   8   */
   9  
  10  /**
  11   * Core class used by the HTML API to represent a DOCTYPE declaration.
  12   *
  13   * This class parses DOCTYPE tokens for the full parser in the HTML Processor.
  14   * Most code interacting with HTML won't need to parse DOCTYPE declarations;
  15   * the HTML Processor is one exception. Consult the HTML Processor for proper
  16   * parsing of an HTML document.
  17   *
  18   * A DOCTYPE declaration may indicate its document compatibility mode, which impacts
  19   * the structure of the following HTML as well as the behavior of CSS class selectors.
  20   * There are three possible modes:
  21   *
  22   *  - "no-quirks" and "limited-quirks" modes (also called "standards mode").
  23   *  - "quirks" mode.
  24   *
  25   * These modes mostly determine whether CSS class name selectors match values in the
  26   * HTML `class` attribute in an ASCII-case-insensitive way (quirks mode), or whether
  27   * they match only when byte-for-byte identical (no-quirks mode).
  28   *
  29   * All HTML documents should start with the standard HTML5 DOCTYPE: `<!DOCTYPE html>`.
  30   *
  31   * > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different
  32   * > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a
  33   * > document ensures that the browser makes a best-effort attempt at following the
  34   * > relevant specifications.
  35   *
  36   * @see https://html.spec.whatwg.org/#the-doctype
  37   *
  38   * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
  39   * and an indication of which document compatibility mode they would imply if an HTML parser
  40   * hadn't already determined it from other information.
  41   *
  42   * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
  43   *
  44   * Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how
  45   * to interpret the various tags and entities within a document. Its role in HTML diverged
  46   * from how it was used in SGML and no meaning should be back-read into HTML based on how it
  47   * is used in SGML, XML, or XHTML documents.
  48   *
  49   * @see https://www.iso.org/standard/16387.html
  50   *
  51   * @since 6.7.0
  52   *
  53   * @access private
  54   *
  55   * @see WP_HTML_Processor
  56   */
  57  class WP_HTML_Doctype_Info {
  58      /**
  59       * Name of the DOCTYPE: should be "html" for HTML documents.
  60       *
  61       * This value should be considered "read only" and not modified.
  62       *
  63       * Historically the DOCTYPE name indicates name of the document's root element.
  64       *
  65       *     <!DOCTYPE html>
  66       *               ╰──┴── name is "html".
  67       *
  68       * @see https://html.spec.whatwg.org/#tokenization
  69       *
  70       * @since 6.7.0
  71       *
  72       * @var string|null
  73       */
  74      public $name = null;
  75  
  76      /**
  77       * Public identifier of the DOCTYPE.
  78       *
  79       * This value should be considered "read only" and not modified.
  80       *
  81       * The public identifier is optional and should not appear in HTML documents.
  82       * A `null` value indicates that no public identifier was present in the DOCTYPE.
  83       *
  84       * Historically the presence of the public identifier indicated that a document
  85       * was meant to be shared between computer systems and the value indicated to a
  86       * knowledgeable parser how to find the relevant document type definition (DTD).
  87       *
  88       *     <!DOCTYPE html PUBLIC "public id goes here in quotes">
  89       *               │  │         ╰─── public identifier ─────╯
  90       *               ╰──┴── name is "html".
  91       *
  92       * @see https://html.spec.whatwg.org/#tokenization
  93       *
  94       * @since 6.7.0
  95       *
  96       * @var string|null
  97       */
  98      public $public_identifier = null;
  99  
 100      /**
 101       * System identifier of the DOCTYPE.
 102       *
 103       * This value should be considered "read only" and not modified.
 104       *
 105       * The system identifier is optional and should not appear in HTML documents.
 106       * A `null` value indicates that no system identifier was present in the DOCTYPE.
 107       *
 108       * Historically the system identifier specified where a relevant document type
 109       * declaration for the given document is stored and may be retrieved.
 110       *
 111       *     <!DOCTYPE html SYSTEM "system id goes here in quotes">
 112       *               │  │         ╰──── system identifier ────╯
 113       *               ╰──┴── name is "html".
 114       *
 115       * If a public identifier were provided it would indicate to a knowledgeable
 116       * parser how to interpret the system identifier.
 117       *
 118       *     <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes">
 119       *               │  │         ╰─── public identifier ─────╯   ╰──── system identifier ────╯
 120       *               ╰──┴── name is "html".
 121       *
 122       * @see https://html.spec.whatwg.org/#tokenization
 123       *
 124       * @since 6.7.0
 125       *
 126       * @var string|null
 127       */
 128      public $system_identifier = null;
 129  
 130      /**
 131       * Which document compatibility mode this DOCTYPE declaration indicates.
 132       *
 133       * This value should be considered "read only" and not modified.
 134       *
 135       * When an HTML parser has not already set the document compatibility mode,
 136       * (e.g. "quirks" or "no-quirks" mode), it will infer if from the properties
 137       * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
 138       * indicate one of three possible document compatibility modes:
 139       *
 140       *  - "no-quirks" and "limited-quirks" modes (also called "standards" mode).
 141       *  - "quirks" mode (also called `CSS1Compat` mode).
 142       *
 143       * An appropriate DOCTYPE is one encountered in the "initial" insertion mode,
 144       * before the HTML element has been opened and before finding any other
 145       * DOCTYPE declaration tokens.
 146       *
 147       * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
 148       *
 149       * @since 6.7.0
 150       *
 151       * @var string One of "no-quirks", "limited-quirks", or "quirks".
 152       */
 153      public $indicated_compatability_mode;
 154  
 155      /**
 156       * Constructor.
 157       *
 158       * This class should not be instantiated directly.
 159       * Use the static {@see self::from_doctype_token} method instead.
 160       *
 161       * The arguments to this constructor correspond to the "DOCTYPE token"
 162       * as defined in the HTML specification.
 163       *
 164       * > DOCTYPE tokens have a name, a public identifier, a system identifier,
 165       * > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier,
 166       * > and system identifier must be marked as missing (which is a distinct state from the
 167       * > empty string), and the force-quirks flag must be set to off (its other state is on).
 168       *
 169       * @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization
 170       *
 171       * @since 6.7.0
 172       *
 173       * @param string|null $name              Name of the DOCTYPE.
 174       * @param string|null $public_identifier Public identifier of the DOCTYPE.
 175       * @param string|null $system_identifier System identifier of the DOCTYPE.
 176       * @param bool        $force_quirks_flag Whether the force-quirks flag is set for the token.
 177       */
 178  	private function __construct(
 179          ?string $name,
 180          ?string $public_identifier,
 181          ?string $system_identifier,
 182          bool $force_quirks_flag
 183      ) {
 184          $this->name              = $name;
 185          $this->public_identifier = $public_identifier;
 186          $this->system_identifier = $system_identifier;
 187  
 188          /*
 189           * > If the DOCTYPE token matches one of the conditions in the following list,
 190           * > then set the Document to quirks mode:
 191           */
 192  
 193          /*
 194           * > The force-quirks flag is set to on.
 195           */
 196          if ( $force_quirks_flag ) {
 197              $this->indicated_compatability_mode = 'quirks';
 198              return;
 199          }
 200  
 201          /*
 202           * Normative documents will contain the literal `<!DOCTYPE html>` with no
 203           * public or system identifiers; short-circuit to avoid extra parsing.
 204           */
 205          if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) {
 206              $this->indicated_compatability_mode = 'no-quirks';
 207              return;
 208          }
 209  
 210          /*
 211           * > The name is not "html".
 212           *
 213           * The tokenizer must report the name in lower case even if provided in
 214           * the document in upper case; thus no conversion is required here.
 215           */
 216          if ( 'html' !== $name ) {
 217              $this->indicated_compatability_mode = 'quirks';
 218              return;
 219          }
 220  
 221          /*
 222           * Set up some variables to handle the rest of the conditions.
 223           *
 224           * > set...the public identifier...to...the empty string if the public identifier was missing.
 225           * > set...the system identifier...to...the empty string if the system identifier was missing.
 226           * >
 227           * > The system identifier and public identifier strings must be compared...
 228           * > in an ASCII case-insensitive manner.
 229           * >
 230           * > A system identifier whose value is the empty string is not considered missing
 231           * > for the purposes of the conditions above.
 232           */
 233          $system_identifier_is_missing = null === $system_identifier;
 234          $public_identifier            = null === $public_identifier ? '' : strtolower( $public_identifier );
 235          $system_identifier            = null === $system_identifier ? '' : strtolower( $system_identifier );
 236  
 237          /*
 238           * > The public identifier is set to…
 239           */
 240          if (
 241              '-//w3o//dtd w3 html strict 3.0//en//' === $public_identifier ||
 242              '-/w3c/dtd html 4.0 transitional/en' === $public_identifier ||
 243              'html' === $public_identifier
 244          ) {
 245              $this->indicated_compatability_mode = 'quirks';
 246              return;
 247          }
 248  
 249          /*
 250           * > The system identifier is set to…
 251           */
 252          if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) {
 253              $this->indicated_compatability_mode = 'quirks';
 254              return;
 255          }
 256  
 257          /*
 258           * All of the following conditions depend on matching the public identifier.
 259           * If the public identifier is empty, none of the following conditions will match.
 260           */
 261          if ( '' === $public_identifier ) {
 262              $this->indicated_compatability_mode = 'no-quirks';
 263              return;
 264          }
 265  
 266          /*
 267           * > The public identifier starts with…
 268           *
 269           * @todo Optimize this matching. It shouldn't be a large overall performance issue,
 270           *       however, as only a single DOCTYPE declaration token should ever be parsed,
 271           *       and normative documents will have exited before reaching this condition.
 272           */
 273          if (
 274              str_starts_with( $public_identifier, '+//silmaril//dtd html pro v0r11 19970101//' ) ||
 275              str_starts_with( $public_identifier, '-//as//dtd html 3.0 aswedit + extensions//' ) ||
 276              str_starts_with( $public_identifier, '-//advasoft ltd//dtd html 3.0 aswedit + extensions//' ) ||
 277              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 1//' ) ||
 278              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 2//' ) ||
 279              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 1//' ) ||
 280              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 2//' ) ||
 281              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict//' ) ||
 282              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0//' ) ||
 283              str_starts_with( $public_identifier, '-//ietf//dtd html 2.1e//' ) ||
 284              str_starts_with( $public_identifier, '-//ietf//dtd html 3.0//' ) ||
 285              str_starts_with( $public_identifier, '-//ietf//dtd html 3.2 final//' ) ||
 286              str_starts_with( $public_identifier, '-//ietf//dtd html 3.2//' ) ||
 287              str_starts_with( $public_identifier, '-//ietf//dtd html 3//' ) ||
 288              str_starts_with( $public_identifier, '-//ietf//dtd html level 0//' ) ||
 289              str_starts_with( $public_identifier, '-//ietf//dtd html level 1//' ) ||
 290              str_starts_with( $public_identifier, '-//ietf//dtd html level 2//' ) ||
 291              str_starts_with( $public_identifier, '-//ietf//dtd html level 3//' ) ||
 292              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 0//' ) ||
 293              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 1//' ) ||
 294              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 2//' ) ||
 295              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 3//' ) ||
 296              str_starts_with( $public_identifier, '-//ietf//dtd html strict//' ) ||
 297              str_starts_with( $public_identifier, '-//ietf//dtd html//' ) ||
 298              str_starts_with( $public_identifier, '-//metrius//dtd metrius presentational//' ) ||
 299              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html strict//' ) ||
 300              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html//' ) ||
 301              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 tables//' ) ||
 302              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html strict//' ) ||
 303              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html//' ) ||
 304              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 tables//' ) ||
 305              str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd html//' ) ||
 306              str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd strict html//' ) ||
 307              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html 2.0//" ) ||
 308              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended 1.0//" ) ||
 309              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended relaxed 1.0//" ) ||
 310              str_starts_with( $public_identifier, '-//sq//dtd html 2.0 hotmetal + extensions//' ) ||
 311              str_starts_with( $public_identifier, '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//' ) ||
 312              str_starts_with( $public_identifier, '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//' ) ||
 313              str_starts_with( $public_identifier, '-//spyglass//dtd html 2.0 extended//' ) ||
 314              str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava html//' ) ||
 315              str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava strict html//' ) ||
 316              str_starts_with( $public_identifier, '-//w3c//dtd html 3 1995-03-24//' ) ||
 317              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 draft//' ) ||
 318              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 final//' ) ||
 319              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2//' ) ||
 320              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2s draft//' ) ||
 321              str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 frameset//' ) ||
 322              str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 transitional//' ) ||
 323              str_starts_with( $public_identifier, '-//w3c//dtd html experimental 19960712//' ) ||
 324              str_starts_with( $public_identifier, '-//w3c//dtd html experimental 970421//' ) ||
 325              str_starts_with( $public_identifier, '-//w3c//dtd w3 html//' ) ||
 326              str_starts_with( $public_identifier, '-//w3o//dtd w3 html 3.0//' ) ||
 327              str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html 2.0//' ) ||
 328              str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html//' )
 329          ) {
 330              $this->indicated_compatability_mode = 'quirks';
 331              return;
 332          }
 333  
 334          /*
 335           * > The system identifier is missing and the public identifier starts with…
 336           */
 337          if (
 338              $system_identifier_is_missing && (
 339                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
 340                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 341              )
 342          ) {
 343              $this->indicated_compatability_mode = 'quirks';
 344              return;
 345          }
 346  
 347          /*
 348           * > Otherwise, if the DOCTYPE token matches one of the conditions in
 349           * > the following list, then set the Document to limited-quirks mode.
 350           */
 351  
 352          /*
 353           * > The public identifier starts with…
 354           */
 355          if (
 356              str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
 357              str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' )
 358          ) {
 359              $this->indicated_compatability_mode = 'limited-quirks';
 360              return;
 361          }
 362  
 363          /*
 364           * > The system identifier is not missing and the public identifier starts with…
 365           */
 366          if (
 367              ! $system_identifier_is_missing && (
 368                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
 369                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 370              )
 371          ) {
 372              $this->indicated_compatability_mode = 'limited-quirks';
 373              return;
 374          }
 375  
 376          $this->indicated_compatability_mode = 'no-quirks';
 377      }
 378  
 379      /**
 380       * Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token.
 381       *
 382       * Use this method to parse a DOCTYPE declaration token and get access to its properties
 383       * via the returned WP_HTML_Doctype_Info class instance. The provided input must parse
 384       * properly as a DOCTYPE declaration, though it must not represent a valid DOCTYPE.
 385       *
 386       * Example:
 387       *
 388       *     // Normative HTML DOCTYPE declaration.
 389       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
 390       *     'no-quirks' === $doctype->indicated_compatability_mode;
 391       *
 392       *     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
 393       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
 394       *     'quirks' === $doctype->indicated_compatability_mode;
 395       *
 396       *     // Textual quirks present in raw HTML are handled appropriately.
 397       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
 398       *     'no-quirks' === $doctype->indicated_compatability_mode;
 399       *
 400       *     // Anything other than a proper DOCTYPE declaration token fails to parse.
 401       *     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
 402       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE ><p>' );
 403       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!TYPEDOC>' );
 404       *     null === WP_HTML_Doctype_Info::from_doctype_token( 'html' );
 405       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<?xml version="1.0" encoding="UTF-8" ?>' );
 406       *
 407       * @since 6.7.0
 408       *
 409       * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
 410       *
 411       * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the
 412       *                                   provided DOCTYPE HTML is a valid DOCTYPE. Otherwise, null.
 413       */
 414  	public static function from_doctype_token( string $doctype_html ): ?self {
 415          $doctype_name      = null;
 416          $doctype_public_id = null;
 417          $doctype_system_id = null;
 418  
 419          $end = strlen( $doctype_html ) - 1;
 420  
 421          /*
 422           * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
 423           * specification for the DOCTYPE related tokenizer states.
 424           *
 425           * @see https://html.spec.whatwg.org/#doctype-state
 426           */
 427  
 428          /*
 429           * - Valid DOCTYPE HTML token must be at least `<!DOCTYPE>` assuming a complete token not
 430           *   ending in end-of-file.
 431           * - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
 432           * - The only occurrence of `>` must be the final byte in the HTML string.
 433           */
 434          if (
 435              $end < 9 ||
 436              0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true )
 437          ) {
 438              return null;
 439          }
 440  
 441          $at = 9;
 442          // Is there one and only one `>`?
 443          if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) {
 444              return null;
 445          }
 446  
 447          /*
 448           * Perform newline normalization and ensure the $end value is correct after normalization.
 449           *
 450           * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
 451           * @see https://infra.spec.whatwg.org/#normalize-newlines
 452           */
 453          $doctype_html = str_replace( "\r\n", "\n", $doctype_html );
 454          $doctype_html = str_replace( "\r", "\n", $doctype_html );
 455          $end          = strlen( $doctype_html ) - 1;
 456  
 457          /*
 458           * In this state, the doctype token has been found and its "content" optionally including the
 459           * name, public identifier, and system identifier is between the current position and the end.
 460           *
 461           *     "<!DOCTYPE...declaration...>"
 462           *               ╰─ $at           ╰─ $end
 463           *
 464           * It's also possible that the declaration part is empty.
 465           *
 466           *               ╭─ $at
 467           *     "<!DOCTYPE>"
 468           *               ╰─ $end
 469           *
 470           * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they
 471           * have been handled above in the condition that the provided DOCTYPE HTML must contain
 472           * exactly one ">" character in the final position.
 473           */
 474  
 475          /*
 476           *
 477           * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and
 478           * proceed to the next state.
 479           *
 480           * @see https://html.spec.whatwg.org/#before-doctype-name-state
 481           */
 482          $at += strspn( $doctype_html, " \t\n\f\r", $at );
 483  
 484          if ( $at >= $end ) {
 485              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 486          }
 487  
 488          $name_length  = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 489          $doctype_name = str_replace( "\0", "\u{FFFD}", strtolower( substr( $doctype_html, $at, $name_length ) ) );
 490  
 491          $at += $name_length;
 492          $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 493          if ( $at >= $end ) {
 494              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 495          }
 496  
 497          /*
 498           * "After DOCTYPE name state"
 499           *
 500           * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
 501           * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
 502           *
 503           * @see https://html.spec.whatwg.org/#after-doctype-name-state
 504           */
 505          if ( $at + 6 >= $end ) {
 506              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 507          }
 508  
 509          /*
 510           * > If the six characters starting from the current input character are an ASCII
 511           * > case-insensitive match for the word "PUBLIC", then consume those characters
 512           * > and switch to the after DOCTYPE public keyword state.
 513           */
 514          if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) {
 515              $at += 6;
 516              $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 517              if ( $at >= $end ) {
 518                  return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 519              }
 520              goto parse_doctype_public_identifier;
 521          }
 522  
 523          /*
 524           * > Otherwise, if the six characters starting from the current input character are an ASCII
 525           * > case-insensitive match for the word "SYSTEM", then consume those characters and switch
 526           * > to the after DOCTYPE system keyword state.
 527           */
 528          if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) {
 529              $at += 6;
 530              $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 531              if ( $at >= $end ) {
 532                  return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 533              }
 534              goto parse_doctype_system_identifier;
 535          }
 536  
 537          /*
 538           * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
 539           * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
 540           * > DOCTYPE state.
 541           */
 542          return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 543  
 544          parse_doctype_public_identifier:
 545          /*
 546           * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
 547           * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
 548           * Anything else forces quirks mode and ignores the rest of the contents.
 549           *
 550           * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
 551           * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
 552           */
 553          $closer_quote = $doctype_html[ $at ];
 554  
 555          /*
 556           * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
 557           * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
 558           */
 559          if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
 560              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 561          }
 562  
 563          ++$at;
 564  
 565          $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
 566          $doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
 567  
 568          $at += $identifier_length;
 569          if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
 570              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 571          }
 572  
 573          ++$at;
 574  
 575          /*
 576           * "Between DOCTYPE public and system identifiers state"
 577           *
 578           * Advance through whitespace between public and system identifiers.
 579           *
 580           * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
 581           */
 582          $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 583          if ( $at >= $end ) {
 584              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 585          }
 586  
 587          parse_doctype_system_identifier:
 588          /*
 589           * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
 590           * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
 591           * Anything else forces quirks mode and ignores the rest of the contents.
 592           *
 593           * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
 594           * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
 595           */
 596          $closer_quote = $doctype_html[ $at ];
 597  
 598          /*
 599           * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
 600           * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
 601           */
 602          if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
 603              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 604          }
 605  
 606          ++$at;
 607  
 608          $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
 609          $doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
 610  
 611          $at += $identifier_length;
 612          if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
 613              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 614          }
 615  
 616          return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 617      }
 618  }


Generated : Wed Aug 13 08:20:01 2025 Cross-referenced by PHPXref