[ Index ]

PHP Cross Reference of WordPress Trunk (Updated Daily)

Search

title

Body

[close]

/wp-includes/html-api/ -> class-wp-html-doctype-info.php (source)

   1  <?php
   2  /**
   3   * HTML API: WP_HTML_Doctype_Info class
   4   *
   5   * @package WordPress
   6   * @subpackage HTML-API
   7   * @since 6.7.0
   8   */
   9  
  10  /**
  11   * Core class used by the HTML API to represent a DOCTYPE declaration.
  12   *
  13   * This class parses DOCTYPE tokens for the full parser in the HTML Processor.
  14   * Most code interacting with HTML won't need to parse DOCTYPE declarations;
  15   * the HTML Processor is one exception. Consult the HTML Processor for proper
  16   * parsing of an HTML document.
  17   *
  18   * A DOCTYPE declaration may indicate its document compatibility mode, which impacts
  19   * the structure of the following HTML as well as the behavior of CSS class selectors.
  20   * There are three possible modes:
  21   *
  22   *  - "no-quirks" and "limited-quirks" modes (also called "standards mode").
  23   *  - "quirks" mode.
  24   *
  25   * These modes mostly determine whether CSS class name selectors match values in the
  26   * HTML `class` attribute in an ASCII-case-insensitive way (quirks mode), or whether
  27   * they match only when byte-for-byte identical (no-quirks mode).
  28   *
  29   * All HTML documents should start with the standard HTML5 DOCTYPE: `<!DOCTYPE html>`.
  30   *
  31   * > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different
  32   * > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a
  33   * > document ensures that the browser makes a best-effort attempt at following the
  34   * > relevant specifications.
  35   *
  36   * @see https://html.spec.whatwg.org/#the-doctype
  37   *
  38   * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
  39   * and an indication of which document compatibility mode they would imply if an HTML parser
  40   * hadn't already determined it from other information.
  41   *
  42   * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
  43   *
  44   * Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how
  45   * to interpret the various tags and entities within a document. Its role in HTML diverged
  46   * from how it was used in SGML and no meaning should be back-read into HTML based on how it
  47   * is used in SGML, XML, or XHTML documents.
  48   *
  49   * @see https://www.iso.org/standard/16387.html
  50   *
  51   * @since 6.7.0
  52   *
  53   * @access private
  54   *
  55   * @see WP_HTML_Processor
  56   */
  57  class WP_HTML_Doctype_Info {
  58      /**
  59       * Name of the DOCTYPE: should be "html" for HTML documents.
  60       *
  61       * This value should be considered "read only" and not modified.
  62       *
  63       * Historically the DOCTYPE name indicates name of the document's root element.
  64       *
  65       *     <!DOCTYPE html>
  66       *               ╰──┴── name is "html".
  67       *
  68       * @see https://html.spec.whatwg.org/#tokenization
  69       *
  70       * @since 6.7.0
  71       *
  72       * @var string|null
  73       */
  74      public $name = null;
  75  
  76      /**
  77       * Public identifier of the DOCTYPE.
  78       *
  79       * This value should be considered "read only" and not modified.
  80       *
  81       * The public identifier is optional and should not appear in HTML documents.
  82       * A `null` value indicates that no public identifier was present in the DOCTYPE.
  83       *
  84       * Historically the presence of the public identifier indicated that a document
  85       * was meant to be shared between computer systems and the value indicated to a
  86       * knowledgeable parser how to find the relevant document type definition (DTD).
  87       *
  88       *     <!DOCTYPE html PUBLIC "public id goes here in quotes">
  89       *               │  │         ╰─── public identifier ─────╯
  90       *               ╰──┴── name is "html".
  91       *
  92       * @see https://html.spec.whatwg.org/#tokenization
  93       *
  94       * @since 6.7.0
  95       *
  96       * @var string|null
  97       */
  98      public $public_identifier = null;
  99  
 100      /**
 101       * System identifier of the DOCTYPE.
 102       *
 103       * This value should be considered "read only" and not modified.
 104       *
 105       * The system identifier is optional and should not appear in HTML documents.
 106       * A `null` value indicates that no system identifier was present in the DOCTYPE.
 107       *
 108       * Historically the system identifier specified where a relevant document type
 109       * declaration for the given document is stored and may be retrieved.
 110       *
 111       *     <!DOCTYPE html SYSTEM "system id goes here in quotes">
 112       *               │  │         ╰──── system identifier ────╯
 113       *               ╰──┴── name is "html".
 114       *
 115       * If a public identifier were provided it would indicate to a knowledgeable
 116       * parser how to interpret the system identifier.
 117       *
 118       *     <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes">
 119       *               │  │         ╰─── public identifier ─────╯   ╰──── system identifier ────╯
 120       *               ╰──┴── name is "html".
 121       *
 122       * @see https://html.spec.whatwg.org/#tokenization
 123       *
 124       * @since 6.7.0
 125       *
 126       * @var string|null
 127       */
 128      public $system_identifier = null;
 129  
 130      /**
 131       * Which document compatibility mode this DOCTYPE declaration indicates.
 132       *
 133       * This value should be considered "read only" and not modified.
 134       *
 135       * When an HTML parser has not already set the document compatibility mode,
 136       * (e.g. "quirks" or "no-quirks" mode), it will be inferred from the properties
 137       * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
 138       * indicate one of three possible document compatibility modes:
 139       * "no-quirks", "limited-quirks", or "quirks".
 140       *
 141       * Browsers expose the resulting document mode via `document.compatMode`:
 142       * - "BackCompat" indicates "quirks" mode.
 143       * - "CSS1Compat" indicates "no-quirks" or "limited-quirks" (these modes are not
 144       *   distinguished by `document.compatMode`).
 145       *
 146       * An appropriate DOCTYPE is one encountered in the "initial" insertion mode,
 147       * before the HTML element has been opened and before finding any other
 148       * DOCTYPE declaration tokens.
 149       *
 150       * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
 151       *
 152       * @since 6.7.0
 153       *
 154       * @var string One of "no-quirks", "limited-quirks", or "quirks".
 155       */
 156      public $indicated_compatibility_mode;
 157  
 158      /**
 159       * Constructor.
 160       *
 161       * This class should not be instantiated directly.
 162       * Use the static {@see self::from_doctype_token} method instead.
 163       *
 164       * The arguments to this constructor correspond to the "DOCTYPE token"
 165       * as defined in the HTML specification.
 166       *
 167       * > DOCTYPE tokens have a name, a public identifier, a system identifier,
 168       * > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier,
 169       * > and system identifier must be marked as missing (which is a distinct state from the
 170       * > empty string), and the force-quirks flag must be set to off (its other state is on).
 171       *
 172       * @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization
 173       *
 174       * @since 6.7.0
 175       *
 176       * @param string|null $name              Name of the DOCTYPE.
 177       * @param string|null $public_identifier Public identifier of the DOCTYPE.
 178       * @param string|null $system_identifier System identifier of the DOCTYPE.
 179       * @param bool        $force_quirks_flag Whether the force-quirks flag is set for the token.
 180       */
 181  	private function __construct(
 182          ?string $name,
 183          ?string $public_identifier,
 184          ?string $system_identifier,
 185          bool $force_quirks_flag
 186      ) {
 187          $this->name              = $name;
 188          $this->public_identifier = $public_identifier;
 189          $this->system_identifier = $system_identifier;
 190  
 191          /*
 192           * > If the DOCTYPE token matches one of the conditions in the following list,
 193           * > then set the Document to quirks mode:
 194           */
 195  
 196          /*
 197           * > The force-quirks flag is set to on.
 198           */
 199          if ( $force_quirks_flag ) {
 200              $this->indicated_compatibility_mode = 'quirks';
 201              return;
 202          }
 203  
 204          /*
 205           * Normative documents will contain the literal `<!DOCTYPE html>` with no
 206           * public or system identifiers; short-circuit to avoid extra parsing.
 207           */
 208          if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) {
 209              $this->indicated_compatibility_mode = 'no-quirks';
 210              return;
 211          }
 212  
 213          /*
 214           * > The name is not "html".
 215           *
 216           * The tokenizer must report the name in lower case even if provided in
 217           * the document in upper case; thus no conversion is required here.
 218           */
 219          if ( 'html' !== $name ) {
 220              $this->indicated_compatibility_mode = 'quirks';
 221              return;
 222          }
 223  
 224          /*
 225           * Set up some variables to handle the rest of the conditions.
 226           *
 227           * > set...the public identifier...to...the empty string if the public identifier was missing.
 228           * > set...the system identifier...to...the empty string if the system identifier was missing.
 229           * >
 230           * > The system identifier and public identifier strings must be compared...
 231           * > in an ASCII case-insensitive manner.
 232           * >
 233           * > A system identifier whose value is the empty string is not considered missing
 234           * > for the purposes of the conditions above.
 235           */
 236          $system_identifier_is_missing = null === $system_identifier;
 237          $public_identifier            = null === $public_identifier ? '' : strtolower( $public_identifier );
 238          $system_identifier            = null === $system_identifier ? '' : strtolower( $system_identifier );
 239  
 240          /*
 241           * > The public identifier is set to…
 242           */
 243          if (
 244              '-//w3o//dtd w3 html strict 3.0//en//' === $public_identifier ||
 245              '-/w3c/dtd html 4.0 transitional/en' === $public_identifier ||
 246              'html' === $public_identifier
 247          ) {
 248              $this->indicated_compatibility_mode = 'quirks';
 249              return;
 250          }
 251  
 252          /*
 253           * > The system identifier is set to…
 254           */
 255          if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) {
 256              $this->indicated_compatibility_mode = 'quirks';
 257              return;
 258          }
 259  
 260          /*
 261           * All of the following conditions depend on matching the public identifier.
 262           * If the public identifier is empty, none of the following conditions will match.
 263           */
 264          if ( '' === $public_identifier ) {
 265              $this->indicated_compatibility_mode = 'no-quirks';
 266              return;
 267          }
 268  
 269          /*
 270           * > The public identifier starts with…
 271           *
 272           * @todo Optimize this matching. It shouldn't be a large overall performance issue,
 273           *       however, as only a single DOCTYPE declaration token should ever be parsed,
 274           *       and normative documents will have exited before reaching this condition.
 275           */
 276          if (
 277              str_starts_with( $public_identifier, '+//silmaril//dtd html pro v0r11 19970101//' ) ||
 278              str_starts_with( $public_identifier, '-//as//dtd html 3.0 aswedit + extensions//' ) ||
 279              str_starts_with( $public_identifier, '-//advasoft ltd//dtd html 3.0 aswedit + extensions//' ) ||
 280              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 1//' ) ||
 281              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 2//' ) ||
 282              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 1//' ) ||
 283              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 2//' ) ||
 284              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict//' ) ||
 285              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0//' ) ||
 286              str_starts_with( $public_identifier, '-//ietf//dtd html 2.1e//' ) ||
 287              str_starts_with( $public_identifier, '-//ietf//dtd html 3.0//' ) ||
 288              str_starts_with( $public_identifier, '-//ietf//dtd html 3.2 final//' ) ||
 289              str_starts_with( $public_identifier, '-//ietf//dtd html 3.2//' ) ||
 290              str_starts_with( $public_identifier, '-//ietf//dtd html 3//' ) ||
 291              str_starts_with( $public_identifier, '-//ietf//dtd html level 0//' ) ||
 292              str_starts_with( $public_identifier, '-//ietf//dtd html level 1//' ) ||
 293              str_starts_with( $public_identifier, '-//ietf//dtd html level 2//' ) ||
 294              str_starts_with( $public_identifier, '-//ietf//dtd html level 3//' ) ||
 295              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 0//' ) ||
 296              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 1//' ) ||
 297              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 2//' ) ||
 298              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 3//' ) ||
 299              str_starts_with( $public_identifier, '-//ietf//dtd html strict//' ) ||
 300              str_starts_with( $public_identifier, '-//ietf//dtd html//' ) ||
 301              str_starts_with( $public_identifier, '-//metrius//dtd metrius presentational//' ) ||
 302              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html strict//' ) ||
 303              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html//' ) ||
 304              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 tables//' ) ||
 305              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html strict//' ) ||
 306              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html//' ) ||
 307              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 tables//' ) ||
 308              str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd html//' ) ||
 309              str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd strict html//' ) ||
 310              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html 2.0//" ) ||
 311              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended 1.0//" ) ||
 312              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended relaxed 1.0//" ) ||
 313              str_starts_with( $public_identifier, '-//sq//dtd html 2.0 hotmetal + extensions//' ) ||
 314              str_starts_with( $public_identifier, '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//' ) ||
 315              str_starts_with( $public_identifier, '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//' ) ||
 316              str_starts_with( $public_identifier, '-//spyglass//dtd html 2.0 extended//' ) ||
 317              str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava html//' ) ||
 318              str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava strict html//' ) ||
 319              str_starts_with( $public_identifier, '-//w3c//dtd html 3 1995-03-24//' ) ||
 320              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 draft//' ) ||
 321              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 final//' ) ||
 322              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2//' ) ||
 323              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2s draft//' ) ||
 324              str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 frameset//' ) ||
 325              str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 transitional//' ) ||
 326              str_starts_with( $public_identifier, '-//w3c//dtd html experimental 19960712//' ) ||
 327              str_starts_with( $public_identifier, '-//w3c//dtd html experimental 970421//' ) ||
 328              str_starts_with( $public_identifier, '-//w3c//dtd w3 html//' ) ||
 329              str_starts_with( $public_identifier, '-//w3o//dtd w3 html 3.0//' ) ||
 330              str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html 2.0//' ) ||
 331              str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html//' )
 332          ) {
 333              $this->indicated_compatibility_mode = 'quirks';
 334              return;
 335          }
 336  
 337          /*
 338           * > The system identifier is missing and the public identifier starts with…
 339           */
 340          if (
 341              $system_identifier_is_missing && (
 342                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
 343                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 344              )
 345          ) {
 346              $this->indicated_compatibility_mode = 'quirks';
 347              return;
 348          }
 349  
 350          /*
 351           * > Otherwise, if the DOCTYPE token matches one of the conditions in
 352           * > the following list, then set the Document to limited-quirks mode.
 353           */
 354  
 355          /*
 356           * > The public identifier starts with…
 357           */
 358          if (
 359              str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
 360              str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' )
 361          ) {
 362              $this->indicated_compatibility_mode = 'limited-quirks';
 363              return;
 364          }
 365  
 366          /*
 367           * > The system identifier is not missing and the public identifier starts with…
 368           */
 369          if (
 370              ! $system_identifier_is_missing && (
 371                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
 372                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 373              )
 374          ) {
 375              $this->indicated_compatibility_mode = 'limited-quirks';
 376              return;
 377          }
 378  
 379          $this->indicated_compatibility_mode = 'no-quirks';
 380      }
 381  
 382      /**
 383       * Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token.
 384       *
 385       * Use this method to parse a DOCTYPE declaration token and get access to its properties
 386       * via the returned WP_HTML_Doctype_Info class instance. The provided input must parse
 387       * properly as a DOCTYPE declaration, though it must not represent a valid DOCTYPE.
 388       *
 389       * Example:
 390       *
 391       *     // Normative HTML DOCTYPE declaration.
 392       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
 393       *     'no-quirks' === $doctype->indicated_compatibility_mode;
 394       *
 395       *     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
 396       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
 397       *     'quirks' === $doctype->indicated_compatibility_mode;
 398       *
 399       *     // Textual quirks present in raw HTML are handled appropriately.
 400       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
 401       *     'no-quirks' === $doctype->indicated_compatibility_mode;
 402       *
 403       *     // Anything other than a proper DOCTYPE declaration token fails to parse.
 404       *     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
 405       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE ><p>' );
 406       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!TYPEDOC>' );
 407       *     null === WP_HTML_Doctype_Info::from_doctype_token( 'html' );
 408       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<?xml version="1.0" encoding="UTF-8" ?>' );
 409       *
 410       * @since 6.7.0
 411       *
 412       * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
 413       *
 414       * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the
 415       *                                   provided DOCTYPE HTML is a valid DOCTYPE. Otherwise, null.
 416       */
 417  	public static function from_doctype_token( string $doctype_html ): ?self {
 418          $doctype_name      = null;
 419          $doctype_public_id = null;
 420          $doctype_system_id = null;
 421  
 422          $end = strlen( $doctype_html ) - 1;
 423  
 424          /*
 425           * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
 426           * specification for the DOCTYPE related tokenizer states.
 427           *
 428           * @see https://html.spec.whatwg.org/#doctype-state
 429           */
 430  
 431          /*
 432           * - Valid DOCTYPE HTML token must be at least `<!DOCTYPE>` assuming a complete token not
 433           *   ending in end-of-file.
 434           * - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
 435           * - The only occurrence of `>` must be the final byte in the HTML string.
 436           */
 437          if (
 438              $end < 9 ||
 439              0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true )
 440          ) {
 441              return null;
 442          }
 443  
 444          $at = 9;
 445          // Is there one and only one `>`?
 446          if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) {
 447              return null;
 448          }
 449  
 450          /*
 451           * Perform newline normalization and ensure the $end value is correct after normalization.
 452           *
 453           * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
 454           * @see https://infra.spec.whatwg.org/#normalize-newlines
 455           */
 456          $doctype_html = str_replace( "\r\n", "\n", $doctype_html );
 457          $doctype_html = str_replace( "\r", "\n", $doctype_html );
 458          $end          = strlen( $doctype_html ) - 1;
 459  
 460          /*
 461           * In this state, the doctype token has been found and its "content" optionally including the
 462           * name, public identifier, and system identifier is between the current position and the end.
 463           *
 464           *     "<!DOCTYPE...declaration...>"
 465           *               ╰─ $at           ╰─ $end
 466           *
 467           * It's also possible that the declaration part is empty.
 468           *
 469           *               ╭─ $at
 470           *     "<!DOCTYPE>"
 471           *               ╰─ $end
 472           *
 473           * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they
 474           * have been handled above in the condition that the provided DOCTYPE HTML must contain
 475           * exactly one ">" character in the final position.
 476           */
 477  
 478          /*
 479           *
 480           * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and
 481           * proceed to the next state.
 482           *
 483           * @see https://html.spec.whatwg.org/#before-doctype-name-state
 484           */
 485          $at += strspn( $doctype_html, " \t\n\f\r", $at );
 486  
 487          if ( $at >= $end ) {
 488              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 489          }
 490  
 491          $name_length  = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 492          $doctype_name = str_replace( "\0", "\u{FFFD}", strtolower( substr( $doctype_html, $at, $name_length ) ) );
 493  
 494          $at += $name_length;
 495          $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 496          if ( $at >= $end ) {
 497              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 498          }
 499  
 500          /*
 501           * "After DOCTYPE name state"
 502           *
 503           * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
 504           * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
 505           *
 506           * @see https://html.spec.whatwg.org/#after-doctype-name-state
 507           */
 508          if ( $at + 6 >= $end ) {
 509              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 510          }
 511  
 512          /*
 513           * > If the six characters starting from the current input character are an ASCII
 514           * > case-insensitive match for the word "PUBLIC", then consume those characters
 515           * > and switch to the after DOCTYPE public keyword state.
 516           */
 517          if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) {
 518              $at += 6;
 519              $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 520              if ( $at >= $end ) {
 521                  return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 522              }
 523              goto parse_doctype_public_identifier;
 524          }
 525  
 526          /*
 527           * > Otherwise, if the six characters starting from the current input character are an ASCII
 528           * > case-insensitive match for the word "SYSTEM", then consume those characters and switch
 529           * > to the after DOCTYPE system keyword state.
 530           */
 531          if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) {
 532              $at += 6;
 533              $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 534              if ( $at >= $end ) {
 535                  return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 536              }
 537              goto parse_doctype_system_identifier;
 538          }
 539  
 540          /*
 541           * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
 542           * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
 543           * > DOCTYPE state.
 544           */
 545          return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 546  
 547          parse_doctype_public_identifier:
 548          /*
 549           * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
 550           * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
 551           * Anything else forces quirks mode and ignores the rest of the contents.
 552           *
 553           * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
 554           * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
 555           */
 556          $closer_quote = $doctype_html[ $at ];
 557  
 558          /*
 559           * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
 560           * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
 561           */
 562          if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
 563              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 564          }
 565  
 566          ++$at;
 567  
 568          $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
 569          $doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
 570  
 571          $at += $identifier_length;
 572          if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
 573              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 574          }
 575  
 576          ++$at;
 577  
 578          /*
 579           * "Between DOCTYPE public and system identifiers state"
 580           *
 581           * Advance through whitespace between public and system identifiers.
 582           *
 583           * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
 584           */
 585          $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 586          if ( $at >= $end ) {
 587              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 588          }
 589  
 590          parse_doctype_system_identifier:
 591          /*
 592           * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
 593           * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
 594           * Anything else forces quirks mode and ignores the rest of the contents.
 595           *
 596           * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
 597           * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
 598           */
 599          $closer_quote = $doctype_html[ $at ];
 600  
 601          /*
 602           * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
 603           * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
 604           */
 605          if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
 606              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 607          }
 608  
 609          ++$at;
 610  
 611          $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
 612          $doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
 613  
 614          $at += $identifier_length;
 615          if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
 616              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 617          }
 618  
 619          return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 620      }
 621  }


Generated : Wed Jun 24 08:20:11 2026 Cross-referenced by PHPXref