[ Index ]

PHP Cross Reference of WordPress Trunk (Updated Daily)

Search

title

Body

[close]

/wp-includes/html-api/ -> class-wp-html-doctype-info.php (source)

   1  <?php
   2  /**
   3   * HTML API: WP_HTML_Doctype_Info class
   4   *
   5   * @package WordPress
   6   * @subpackage HTML-API
   7   * @since 6.7.0
   8   */
   9  
  10  /**
  11   * Core class used by the HTML API to represent a DOCTYPE declaration.
  12   *
  13   * This class parses DOCTYPE tokens for the full parser in the HTML Processor.
  14   * Most code interacting with HTML won't need to parse DOCTYPE declarations;
  15   * the HTML Processor is one exception. Consult the HTML Processor for proper
  16   * parsing of an HTML document.
  17   *
  18   * A DOCTYPE declaration may indicate its document compatibility mode, which impacts
  19   * the structure of the following HTML as well as the behavior of CSS class selectors.
  20   * There are three possible modes:
  21   *
  22   *  - "no-quirks" and "limited-quirks" modes (also called "standards mode").
  23   *  - "quirks" mode.
  24   *
  25   * These modes mostly determine whether CSS class name selectors match values in the
  26   * HTML `class` attribute in an ASCII-case-insensitive way (quirks mode), or whether
  27   * they match only when byte-for-byte identical (no-quirks mode).
  28   *
  29   * All HTML documents should start with the standard HTML5 DOCTYPE: `<!DOCTYPE html>`.
  30   *
  31   * > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different
  32   * > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a
  33   * > document ensures that the browser makes a best-effort attempt at following the
  34   * > relevant specifications.
  35   *
  36   * @see https://html.spec.whatwg.org/#the-doctype
  37   *
  38   * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
  39   * and an indication of which document compatibility mode they would imply if an HTML parser
  40   * hadn't already determined it from other information.
  41   *
  42   * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
  43   *
  44   * Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how
  45   * to interpret the various tags and entities within a document. Its role in HTML diverged
  46   * from how it was used in SGML and no meaning should be back-read into HTML based on how it
  47   * is used in SGML, XML, or XHTML documents.
  48   *
  49   * @see https://www.iso.org/standard/16387.html
  50   *
  51   * @since 6.7.0
  52   *
  53   * @see WP_HTML_Processor
  54   */
  55  class WP_HTML_Doctype_Info {
  56      /**
  57       * Name of the DOCTYPE: should be "html" for HTML documents.
  58       *
  59       * This value should be considered "read only" and not modified.
  60       *
  61       * Historically the DOCTYPE name indicates name of the document's root element.
  62       *
  63       *     <!DOCTYPE html>
  64       *               ╰──┴── name is "html".
  65       *
  66       * @see https://html.spec.whatwg.org/#tokenization
  67       *
  68       * @since 6.7.0
  69       *
  70       * @var string|null
  71       */
  72      public $name = null;
  73  
  74      /**
  75       * Public identifier of the DOCTYPE.
  76       *
  77       * This value should be considered "read only" and not modified.
  78       *
  79       * The public identifier is optional and should not appear in HTML documents.
  80       * A `null` value indicates that no public identifier was present in the DOCTYPE.
  81       *
  82       * Historically the presence of the public identifier indicated that a document
  83       * was meant to be shared between computer systems and the value indicated to a
  84       * knowledgeable parser how to find the relevant document type definition (DTD).
  85       *
  86       *     <!DOCTYPE html PUBLIC "public id goes here in quotes">
  87       *               │  │         ╰─── public identifier ─────╯
  88       *               ╰──┴── name is "html".
  89       *
  90       * @see https://html.spec.whatwg.org/#tokenization
  91       *
  92       * @since 6.7.0
  93       *
  94       * @var string|null
  95       */
  96      public $public_identifier = null;
  97  
  98      /**
  99       * System identifier of the DOCTYPE.
 100       *
 101       * This value should be considered "read only" and not modified.
 102       *
 103       * The system identifier is optional and should not appear in HTML documents.
 104       * A `null` value indicates that no system identifier was present in the DOCTYPE.
 105       *
 106       * Historically the system identifier specified where a relevant document type
 107       * declaration for the given document is stored and may be retrieved.
 108       *
 109       *     <!DOCTYPE html SYSTEM "system id goes here in quotes">
 110       *               │  │         ╰──── system identifier ────╯
 111       *               ╰──┴── name is "html".
 112       *
 113       * If a public identifier were provided it would indicate to a knowledgeable
 114       * parser how to interpret the system identifier.
 115       *
 116       *     <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes">
 117       *               │  │         ╰─── public identifier ─────╯   ╰──── system identifier ────╯
 118       *               ╰──┴── name is "html".
 119       *
 120       * @see https://html.spec.whatwg.org/#tokenization
 121       *
 122       * @since 6.7.0
 123       *
 124       * @var string|null
 125       */
 126      public $system_identifier = null;
 127  
 128      /**
 129       * Which document compatability mode this DOCTYPE declaration indicates.
 130       *
 131       * This value should be considered "read only" and not modified.
 132       *
 133       * When an HTML parser has not already set the document compatability mode,
 134       * (e.g. "quirks" or "no-quirks" mode), it will infer if from the properties
 135       * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
 136       * indicate one of three possible document compatability modes:
 137       *
 138       *  - "no-quirks" and "limited-quirks" modes (also called "standards" mode).
 139       *  - "quirks" mode (also called `CSS1Compat` mode).
 140       *
 141       * An appropriate DOCTYPE is one encountered in the "initial" insertion mode,
 142       * before the HTML element has been opened and before finding any other
 143       * DOCTYPE declaration tokens.
 144       *
 145       * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
 146       *
 147       * @since 6.7.0
 148       *
 149       * @var string One of "no-quirks", "limited-quirks", or "quirks".
 150       */
 151      public $indicated_compatability_mode;
 152  
 153      /**
 154       * Constructor.
 155       *
 156       * This class should not be instantiated directly.
 157       * Use the static {@see self::from_doctype_token} method instead.
 158       *
 159       * The arguments to this constructor correspond to the "DOCTYPE token"
 160       * as defined in the HTML specification.
 161       *
 162       * > DOCTYPE tokens have a name, a public identifier, a system identifier,
 163       * > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier,
 164       * > and system identifier must be marked as missing (which is a distinct state from the
 165       * > empty string), and the force-quirks flag must be set to off (its other state is on).
 166       *
 167       * @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization
 168       *
 169       * @since 6.7.0
 170       *
 171       * @param string|null $name              Name of the DOCTYPE.
 172       * @param string|null $public_identifier Public identifier of the DOCTYPE.
 173       * @param string|null $system_identifier System identifier of the DOCTYPE.
 174       * @param bool        $force_quirks_flag Whether the force-quirks flag is set for the token.
 175       */
 176  	private function __construct(
 177          ?string $name,
 178          ?string $public_identifier,
 179          ?string $system_identifier,
 180          bool $force_quirks_flag
 181      ) {
 182          $this->name              = $name;
 183          $this->public_identifier = $public_identifier;
 184          $this->system_identifier = $system_identifier;
 185  
 186          /*
 187           * > If the DOCTYPE token matches one of the conditions in the following list,
 188           * > then set the Document to quirks mode:
 189           */
 190  
 191          /*
 192           * > The force-quirks flag is set to on.
 193           */
 194          if ( $force_quirks_flag ) {
 195              $this->indicated_compatability_mode = 'quirks';
 196              return;
 197          }
 198  
 199          /*
 200           * Normative documents will contain the literal `<!DOCTYPE html>` with no
 201           * public or system identifiers; short-circuit to avoid extra parsing.
 202           */
 203          if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) {
 204              $this->indicated_compatability_mode = 'no-quirks';
 205              return;
 206          }
 207  
 208          /*
 209           * > The name is not "html".
 210           *
 211           * The tokenizer must report the name in lower case even if provided in
 212           * the document in upper case; thus no conversion is required here.
 213           */
 214          if ( 'html' !== $name ) {
 215              $this->indicated_compatability_mode = 'quirks';
 216              return;
 217          }
 218  
 219          /*
 220           * Set up some variables to handle the rest of the conditions.
 221           *
 222           * > set...the public identifier...to...the empty string if the public identifier was missing.
 223           * > set...the system identifier...to...the empty string if the system identifier was missing.
 224           * >
 225           * > The system identifier and public identifier strings must be compared...
 226           * > in an ASCII case-insensitive manner.
 227           * >
 228           * > A system identifier whose value is the empty string is not considered missing
 229           * > for the purposes of the conditions above.
 230           */
 231          $system_identifier_is_missing = null === $system_identifier;
 232          $public_identifier            = null === $public_identifier ? '' : strtolower( $public_identifier );
 233          $system_identifier            = null === $system_identifier ? '' : strtolower( $system_identifier );
 234  
 235          /*
 236           * > The public identifier is set to…
 237           */
 238          if (
 239              '-//w3o//dtd w3 html strict 3.0//en//' === $public_identifier ||
 240              '-/w3c/dtd html 4.0 transitional/en' === $public_identifier ||
 241              'html' === $public_identifier
 242          ) {
 243              $this->indicated_compatability_mode = 'quirks';
 244              return;
 245          }
 246  
 247          /*
 248           * > The system identifier is set to…
 249           */
 250          if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) {
 251              $this->indicated_compatability_mode = 'quirks';
 252              return;
 253          }
 254  
 255          /*
 256           * All of the following conditions depend on matching the public identifier.
 257           * If the public identifier is empty, none of the following conditions will match.
 258           */
 259          if ( '' === $public_identifier ) {
 260              $this->indicated_compatability_mode = 'no-quirks';
 261              return;
 262          }
 263  
 264          /*
 265           * > The public identifier starts with…
 266           *
 267           * @todo Optimize this matching. It shouldn't be a large overall performance issue,
 268           *       however, as only a single DOCTYPE declaration token should ever be parsed,
 269           *       and normative documents will have exited before reaching this condition.
 270           */
 271          if (
 272              str_starts_with( $public_identifier, '+//silmaril//dtd html pro v0r11 19970101//' ) ||
 273              str_starts_with( $public_identifier, '-//as//dtd html 3.0 aswedit + extensions//' ) ||
 274              str_starts_with( $public_identifier, '-//advasoft ltd//dtd html 3.0 aswedit + extensions//' ) ||
 275              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 1//' ) ||
 276              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 2//' ) ||
 277              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 1//' ) ||
 278              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 2//' ) ||
 279              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict//' ) ||
 280              str_starts_with( $public_identifier, '-//ietf//dtd html 2.0//' ) ||
 281              str_starts_with( $public_identifier, '-//ietf//dtd html 2.1e//' ) ||
 282              str_starts_with( $public_identifier, '-//ietf//dtd html 3.0//' ) ||
 283              str_starts_with( $public_identifier, '-//ietf//dtd html 3.2 final//' ) ||
 284              str_starts_with( $public_identifier, '-//ietf//dtd html 3.2//' ) ||
 285              str_starts_with( $public_identifier, '-//ietf//dtd html 3//' ) ||
 286              str_starts_with( $public_identifier, '-//ietf//dtd html level 0//' ) ||
 287              str_starts_with( $public_identifier, '-//ietf//dtd html level 1//' ) ||
 288              str_starts_with( $public_identifier, '-//ietf//dtd html level 2//' ) ||
 289              str_starts_with( $public_identifier, '-//ietf//dtd html level 3//' ) ||
 290              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 0//' ) ||
 291              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 1//' ) ||
 292              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 2//' ) ||
 293              str_starts_with( $public_identifier, '-//ietf//dtd html strict level 3//' ) ||
 294              str_starts_with( $public_identifier, '-//ietf//dtd html strict//' ) ||
 295              str_starts_with( $public_identifier, '-//ietf//dtd html//' ) ||
 296              str_starts_with( $public_identifier, '-//metrius//dtd metrius presentational//' ) ||
 297              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html strict//' ) ||
 298              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html//' ) ||
 299              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 tables//' ) ||
 300              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html strict//' ) ||
 301              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html//' ) ||
 302              str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 tables//' ) ||
 303              str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd html//' ) ||
 304              str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd strict html//' ) ||
 305              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html 2.0//" ) ||
 306              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended 1.0//" ) ||
 307              str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended relaxed 1.0//" ) ||
 308              str_starts_with( $public_identifier, '-//sq//dtd html 2.0 hotmetal + extensions//' ) ||
 309              str_starts_with( $public_identifier, '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//' ) ||
 310              str_starts_with( $public_identifier, '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//' ) ||
 311              str_starts_with( $public_identifier, '-//spyglass//dtd html 2.0 extended//' ) ||
 312              str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava html//' ) ||
 313              str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava strict html//' ) ||
 314              str_starts_with( $public_identifier, '-//w3c//dtd html 3 1995-03-24//' ) ||
 315              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 draft//' ) ||
 316              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 final//' ) ||
 317              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2//' ) ||
 318              str_starts_with( $public_identifier, '-//w3c//dtd html 3.2s draft//' ) ||
 319              str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 frameset//' ) ||
 320              str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 transitional//' ) ||
 321              str_starts_with( $public_identifier, '-//w3c//dtd html experimental 19960712//' ) ||
 322              str_starts_with( $public_identifier, '-//w3c//dtd html experimental 970421//' ) ||
 323              str_starts_with( $public_identifier, '-//w3c//dtd w3 html//' ) ||
 324              str_starts_with( $public_identifier, '-//w3o//dtd w3 html 3.0//' ) ||
 325              str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html 2.0//' ) ||
 326              str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html//' )
 327          ) {
 328              $this->indicated_compatability_mode = 'quirks';
 329              return;
 330          }
 331  
 332          /*
 333           * > The system identifier is missing and the public identifier starts with…
 334           */
 335          if (
 336              $system_identifier_is_missing && (
 337                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
 338                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 339              )
 340          ) {
 341              $this->indicated_compatability_mode = 'quirks';
 342              return;
 343          }
 344  
 345          /*
 346           * > Otherwise, if the DOCTYPE token matches one of the conditions in
 347           * > the following list, then set the Document to limited-quirks mode.
 348           */
 349  
 350          /*
 351           * > The public identifier starts with…
 352           */
 353          if (
 354              str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
 355              str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' )
 356          ) {
 357              $this->indicated_compatability_mode = 'limited-quirks';
 358              return;
 359          }
 360  
 361          /*
 362           * > The system identifier is not missing and the public identifier starts with…
 363           */
 364          if (
 365              ! $system_identifier_is_missing && (
 366                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
 367                  str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 368              )
 369          ) {
 370              $this->indicated_compatability_mode = 'limited-quirks';
 371              return;
 372          }
 373  
 374          $this->indicated_compatability_mode = 'no-quirks';
 375      }
 376  
 377      /**
 378       * Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token.
 379       *
 380       * Use this method to parse a DOCTYPE declaration token and get access to its properties
 381       * via the returned WP_HTML_Doctype_Info class instance. The provided input must parse
 382       * properly as a DOCTYPE declaration, though it must not represent a valid DOCTYPE.
 383       *
 384       * Example:
 385       *
 386       *     // Normative HTML DOCTYPE declaration.
 387       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
 388       *     'no-quirks' === $doctype->indicated_compatability_mode;
 389       *
 390       *     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
 391       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
 392       *     'quirks' === $doctype->indicated_compatability_mode;
 393       *
 394       *     // Textual quirks present in raw HTML are handled appropriately.
 395       *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
 396       *     'no-quirks' === $doctype->indicated_compatability_mode;
 397       *
 398       *     // Anything other than a proper DOCTYPE declaration token fails to parse.
 399       *     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
 400       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE ><p>' );
 401       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!TYPEDOC>' );
 402       *     null === WP_HTML_Doctype_Info::from_doctype_token( 'html' );
 403       *     null === WP_HTML_Doctype_Info::from_doctype_token( '<?xml version="1.0" encoding="UTF-8" ?>' );
 404       *
 405       * @since 6.7.0
 406       *
 407       * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
 408       *
 409       * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the
 410       *                                   provided DOCTYPE HTML is a valid DOCTYPE. Otherwise, null.
 411       */
 412  	public static function from_doctype_token( string $doctype_html ): ?self {
 413          $doctype_name      = null;
 414          $doctype_public_id = null;
 415          $doctype_system_id = null;
 416  
 417          $end = strlen( $doctype_html ) - 1;
 418  
 419          /*
 420           * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
 421           * specification for the DOCTYPE related tokenizer states.
 422           *
 423           * @see https://html.spec.whatwg.org/#doctype-state
 424           */
 425  
 426          /*
 427           * - Valid DOCTYPE HTML token must be at least `<!DOCTYPE>` assuming a complete token not
 428           *   ending in end-of-file.
 429           * - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
 430           * - The only occurrence of `>` must be the final byte in the HTML string.
 431           */
 432          if (
 433              $end < 9 ||
 434              0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true )
 435          ) {
 436              return null;
 437          }
 438  
 439          $at = 9;
 440          // Is there one and only one `>`?
 441          if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) {
 442              return null;
 443          }
 444  
 445          /*
 446           * Perform newline normalization and ensure the $end value is correct after normalization.
 447           *
 448           * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
 449           * @see https://infra.spec.whatwg.org/#normalize-newlines
 450           */
 451          $doctype_html = str_replace( "\r\n", "\n", $doctype_html );
 452          $doctype_html = str_replace( "\r", "\n", $doctype_html );
 453          $end          = strlen( $doctype_html ) - 1;
 454  
 455          /*
 456           * In this state, the doctype token has been found and its "content" optionally including the
 457           * name, public identifier, and system identifier is between the current position and the end.
 458           *
 459           *     "<!DOCTYPE...declaration...>"
 460           *               ╰─ $at           ╰─ $end
 461           *
 462           * It's also possible that the declaration part is empty.
 463           *
 464           *               ╭─ $at
 465           *     "<!DOCTYPE>"
 466           *               ╰─ $end
 467           *
 468           * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they
 469           * have been handled above in the condition that the provided DOCTYPE HTML must contain
 470           * exactly one ">" character in the final position.
 471           */
 472  
 473          /*
 474           *
 475           * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and
 476           * proceed to the next state.
 477           *
 478           * @see https://html.spec.whatwg.org/#before-doctype-name-state
 479           */
 480          $at += strspn( $doctype_html, " \t\n\f\r", $at );
 481  
 482          if ( $at >= $end ) {
 483              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 484          }
 485  
 486          $name_length  = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 487          $doctype_name = str_replace( "\0", "\u{FFFD}", strtolower( substr( $doctype_html, $at, $name_length ) ) );
 488  
 489          $at += $name_length;
 490          $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 491          if ( $at >= $end ) {
 492              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 493          }
 494  
 495          /*
 496           * "After DOCTYPE name state"
 497           *
 498           * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
 499           * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
 500           *
 501           * @see https://html.spec.whatwg.org/#after-doctype-name-state
 502           */
 503          if ( $at + 6 >= $end ) {
 504              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 505          }
 506  
 507          /*
 508           * > If the six characters starting from the current input character are an ASCII
 509           * > case-insensitive match for the word "PUBLIC", then consume those characters
 510           * > and switch to the after DOCTYPE public keyword state.
 511           */
 512          if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) {
 513              $at += 6;
 514              $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 515              if ( $at >= $end ) {
 516                  return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 517              }
 518              goto parse_doctype_public_identifier;
 519          }
 520  
 521          /*
 522           * > Otherwise, if the six characters starting from the current input character are an ASCII
 523           * > case-insensitive match for the word "SYSTEM", then consume those characters and switch
 524           * > to the after DOCTYPE system keyword state.
 525           */
 526          if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) {
 527              $at += 6;
 528              $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 529              if ( $at >= $end ) {
 530                  return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 531              }
 532              goto parse_doctype_system_identifier;
 533          }
 534  
 535          /*
 536           * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
 537           * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
 538           * > DOCTYPE state.
 539           */
 540          return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 541  
 542          parse_doctype_public_identifier:
 543          /*
 544           * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
 545           * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
 546           * Anything else forces quirks mode and ignores the rest of the contents.
 547           *
 548           * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
 549           * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
 550           */
 551          $closer_quote = $doctype_html[ $at ];
 552  
 553          /*
 554           * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
 555           * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
 556           */
 557          if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
 558              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 559          }
 560  
 561          ++$at;
 562  
 563          $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
 564          $doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
 565  
 566          $at += $identifier_length;
 567          if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
 568              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 569          }
 570  
 571          ++$at;
 572  
 573          /*
 574           * "Between DOCTYPE public and system identifiers state"
 575           *
 576           * Advance through whitespace between public and system identifiers.
 577           *
 578           * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
 579           */
 580          $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
 581          if ( $at >= $end ) {
 582              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 583          }
 584  
 585          parse_doctype_system_identifier:
 586          /*
 587           * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
 588           * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
 589           * Anything else forces quirks mode and ignores the rest of the contents.
 590           *
 591           * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
 592           * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
 593           */
 594          $closer_quote = $doctype_html[ $at ];
 595  
 596          /*
 597           * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
 598           * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
 599           */
 600          if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
 601              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 602          }
 603  
 604          ++$at;
 605  
 606          $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
 607          $doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
 608  
 609          $at += $identifier_length;
 610          if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
 611              return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
 612          }
 613  
 614          return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
 615      }
 616  }


Generated : Tue Dec 24 08:20:01 2024 Cross-referenced by PHPXref