[ Index ]

PHP Cross Reference of WordPress Trunk (Updated Daily)

Search

title

Body

[close]

/wp-includes/rest-api/endpoints/ -> class-wp-rest-url-details-controller.php (source)

   1  <?php
   2  /**
   3   * REST API: WP_REST_URL_Details_Controller class
   4   *
   5   * @package WordPress
   6   * @subpackage REST_API
   7   * @since 5.9.0
   8   */
   9  
  10  /**
  11   * Controller which provides REST endpoint for retrieving information
  12   * from a remote site's HTML response.
  13   *
  14   * @since 5.9.0
  15   *
  16   * @see WP_REST_Controller
  17   */
  18  class WP_REST_URL_Details_Controller extends WP_REST_Controller {
  19  
  20      /**
  21       * Constructs the controller.
  22       *
  23       * @since 5.9.0
  24       */
  25  	public function __construct() {
  26          $this->namespace = 'wp-block-editor/v1';
  27          $this->rest_base = 'url-details';
  28      }
  29  
  30      /**
  31       * Registers the necessary REST API routes.
  32       *
  33       * @since 5.9.0
  34       */
  35  	public function register_routes() {
  36          register_rest_route(
  37              $this->namespace,
  38              '/' . $this->rest_base,
  39              array(
  40                  array(
  41                      'methods'             => WP_REST_Server::READABLE,
  42                      'callback'            => array( $this, 'parse_url_details' ),
  43                      'args'                => array(
  44                          'url' => array(
  45                              'required'          => true,
  46                              'description'       => __( 'The URL to process.' ),
  47                              'validate_callback' => 'wp_http_validate_url',
  48                              'sanitize_callback' => 'sanitize_url',
  49                              'type'              => 'string',
  50                              'format'            => 'uri',
  51                          ),
  52                      ),
  53                      'permission_callback' => array( $this, 'permissions_check' ),
  54                      'schema'              => array( $this, 'get_public_item_schema' ),
  55                  ),
  56              )
  57          );
  58      }
  59  
  60      /**
  61       * Retrieves the item's schema, conforming to JSON Schema.
  62       *
  63       * @since 5.9.0
  64       *
  65       * @return array Item schema data.
  66       */
  67  	public function get_item_schema() {
  68          if ( $this->schema ) {
  69              return $this->add_additional_fields_schema( $this->schema );
  70          }
  71  
  72          $this->schema = array(
  73              '$schema'    => 'http://json-schema.org/draft-04/schema#',
  74              'title'      => 'url-details',
  75              'type'       => 'object',
  76              'properties' => array(
  77                  'title'       => array(
  78                      'description' => sprintf(
  79                          /* translators: %s: HTML title tag. */
  80                          __( 'The contents of the %s element from the URL.' ),
  81                          '<title>'
  82                      ),
  83                      'type'        => 'string',
  84                      'context'     => array( 'view', 'edit', 'embed' ),
  85                      'readonly'    => true,
  86                  ),
  87                  'icon'        => array(
  88                      'description' => sprintf(
  89                          /* translators: %s: HTML link tag. */
  90                          __( 'The favicon image link of the %s element from the URL.' ),
  91                          '<link rel="icon">'
  92                      ),
  93                      'type'        => 'string',
  94                      'format'      => 'uri',
  95                      'context'     => array( 'view', 'edit', 'embed' ),
  96                      'readonly'    => true,
  97                  ),
  98                  'description' => array(
  99                      'description' => sprintf(
 100                          /* translators: %s: HTML meta tag. */
 101                          __( 'The content of the %s element from the URL.' ),
 102                          '<meta name="description">'
 103                      ),
 104                      'type'        => 'string',
 105                      'context'     => array( 'view', 'edit', 'embed' ),
 106                      'readonly'    => true,
 107                  ),
 108                  'image'       => array(
 109                      'description' => sprintf(
 110                          /* translators: 1: HTML meta tag, 2: HTML meta tag. */
 111                          __( 'The Open Graph image link of the %1$s or %2$s element from the URL.' ),
 112                          '<meta property="og:image">',
 113                          '<meta property="og:image:url">'
 114                      ),
 115                      'type'        => 'string',
 116                      'format'      => 'uri',
 117                      'context'     => array( 'view', 'edit', 'embed' ),
 118                      'readonly'    => true,
 119                  ),
 120              ),
 121          );
 122  
 123          return $this->add_additional_fields_schema( $this->schema );
 124      }
 125  
 126      /**
 127       * Retrieves the contents of the title tag from the HTML response.
 128       *
 129       * @since 5.9.0
 130       *
 131       * @param WP_REST_Request $request Full details about the request.
 132       * @return WP_REST_Response|WP_Error The parsed details as a response object. WP_Error if there are errors.
 133       */
 134  	public function parse_url_details( $request ) {
 135          $url = untrailingslashit( $request['url'] );
 136  
 137          if ( empty( $url ) ) {
 138              return new WP_Error( 'rest_invalid_url', __( 'Invalid URL' ), array( 'status' => 404 ) );
 139          }
 140  
 141          // Transient per URL.
 142          $cache_key = $this->build_cache_key_for_url( $url );
 143  
 144          // Attempt to retrieve cached response.
 145          $cached_response = $this->get_cache( $cache_key );
 146  
 147          if ( ! empty( $cached_response ) ) {
 148              $remote_url_response = $cached_response;
 149          } else {
 150              $remote_url_response = $this->get_remote_url( $url );
 151  
 152              // Exit if we don't have a valid body or it's empty.
 153              if ( is_wp_error( $remote_url_response ) || empty( $remote_url_response ) ) {
 154                  return $remote_url_response;
 155              }
 156  
 157              // Cache the valid response.
 158              $this->set_cache( $cache_key, $remote_url_response );
 159          }
 160  
 161          $html_head     = $this->get_document_head( $remote_url_response );
 162          $meta_elements = $this->get_meta_with_content_elements( $html_head );
 163  
 164          $data = $this->add_additional_fields_to_object(
 165              array(
 166                  'title'       => $this->get_title( $html_head ),
 167                  'icon'        => $this->get_icon( $html_head, $url ),
 168                  'description' => $this->get_description( $meta_elements ),
 169                  'image'       => $this->get_image( $meta_elements, $url ),
 170              ),
 171              $request
 172          );
 173  
 174          // Wrap the data in a response object.
 175          $response = rest_ensure_response( $data );
 176  
 177          /**
 178           * Filters the URL data for the response.
 179           *
 180           * @since 5.9.0
 181           *
 182           * @param WP_REST_Response $response            The response object.
 183           * @param string           $url                 The requested URL.
 184           * @param WP_REST_Request  $request             Request object.
 185           * @param string           $remote_url_response HTTP response body from the remote URL.
 186           */
 187          return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response );
 188      }
 189  
 190      /**
 191       * Checks whether a given request has permission to read remote URLs.
 192       *
 193       * @since 5.9.0
 194       *
 195       * @return true|WP_Error True if the request has permission, else WP_Error.
 196       */
 197  	public function permissions_check() {
 198          if ( current_user_can( 'edit_posts' ) ) {
 199              return true;
 200          }
 201  
 202          foreach ( get_post_types( array( 'show_in_rest' => true ), 'objects' ) as $post_type ) {
 203              if ( current_user_can( $post_type->cap->edit_posts ) ) {
 204                  return true;
 205              }
 206          }
 207  
 208          return new WP_Error(
 209              'rest_cannot_view_url_details',
 210              __( 'Sorry, you are not allowed to process remote URLs.' ),
 211              array( 'status' => rest_authorization_required_code() )
 212          );
 213      }
 214  
 215      /**
 216       * Retrieves the document title from a remote URL.
 217       *
 218       * @since 5.9.0
 219       *
 220       * @param string $url The website URL whose HTML to access.
 221       * @return string|WP_Error The HTTP response from the remote URL on success.
 222       *                         WP_Error if no response or no content.
 223       */
 224  	private function get_remote_url( $url ) {
 225  
 226          /*
 227           * Provide a modified UA string to workaround web properties which block WordPress "Pingbacks".
 228           * Why? The UA string used for pingback requests contains `WordPress/` which is very similar
 229           * to that used as the default UA string by the WP HTTP API. Therefore requests from this
 230           * REST endpoint are being unintentionally blocked as they are misidentified as pingback requests.
 231           * By slightly modifying the UA string, but still retaining the "WordPress" identification (via "WP")
 232           * we are able to work around this issue.
 233           * Example UA string: `WP-URLDetails/5.9-alpha-51389 (+http://localhost:8888)`.
 234          */
 235          $modified_user_agent = 'WP-URLDetails/' . get_bloginfo( 'version' ) . ' (+' . get_bloginfo( 'url' ) . ')';
 236  
 237          $args = array(
 238              'limit_response_size' => 150 * KB_IN_BYTES,
 239              'user-agent'          => $modified_user_agent,
 240          );
 241  
 242          /**
 243           * Filters the HTTP request args for URL data retrieval.
 244           *
 245           * Can be used to adjust response size limit and other WP_Http::request() args.
 246           *
 247           * @since 5.9.0
 248           *
 249           * @param array  $args Arguments used for the HTTP request.
 250           * @param string $url  The attempted URL.
 251           */
 252          $args = apply_filters( 'rest_url_details_http_request_args', $args, $url );
 253  
 254          $response = wp_safe_remote_get( $url, $args );
 255  
 256          if ( WP_Http::OK !== wp_remote_retrieve_response_code( $response ) ) {
 257              // Not saving the error response to cache since the error might be temporary.
 258              return new WP_Error(
 259                  'no_response',
 260                  __( 'URL not found. Response returned a non-200 status code for this URL.' ),
 261                  array( 'status' => WP_Http::NOT_FOUND )
 262              );
 263          }
 264  
 265          $remote_body = wp_remote_retrieve_body( $response );
 266  
 267          if ( empty( $remote_body ) ) {
 268              return new WP_Error(
 269                  'no_content',
 270                  __( 'Unable to retrieve body from response at this URL.' ),
 271                  array( 'status' => WP_Http::NOT_FOUND )
 272              );
 273          }
 274  
 275          return $remote_body;
 276      }
 277  
 278      /**
 279       * Parses the title tag contents from the provided HTML.
 280       *
 281       * @since 5.9.0
 282       *
 283       * @param string $html The HTML from the remote website at URL.
 284       * @return string The title tag contents on success. Empty string if not found.
 285       */
 286  	private function get_title( $html ) {
 287          $pattern = '#<title[^>]*>(.*?)<\s*/\s*title>#is';
 288          preg_match( $pattern, $html, $match_title );
 289  
 290          if ( empty( $match_title[1] ) || ! is_string( $match_title[1] ) ) {
 291              return '';
 292          }
 293  
 294          $title = trim( $match_title[1] );
 295  
 296          return $this->prepare_metadata_for_output( $title );
 297      }
 298  
 299      /**
 300       * Parses the site icon from the provided HTML.
 301       *
 302       * @since 5.9.0
 303       *
 304       * @param string $html The HTML from the remote website at URL.
 305       * @param string $url  The target website URL.
 306       * @return string The icon URI on success. Empty string if not found.
 307       */
 308  	private function get_icon( $html, $url ) {
 309          // Grab the icon's link element.
 310          $pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
 311          preg_match( $pattern, $html, $element );
 312          if ( empty( $element[0] ) || ! is_string( $element[0] ) ) {
 313              return '';
 314          }
 315          $element = trim( $element[0] );
 316  
 317          // Get the icon's href value.
 318          $pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
 319          preg_match( $pattern, $element, $icon );
 320          if ( empty( $icon[2] ) || ! is_string( $icon[2] ) ) {
 321              return '';
 322          }
 323          $icon = trim( $icon[2] );
 324  
 325          // If the icon is a data URL, return it.
 326          $parsed_icon = parse_url( $icon );
 327          if ( isset( $parsed_icon['scheme'] ) && 'data' === $parsed_icon['scheme'] ) {
 328              return $icon;
 329          }
 330  
 331          // Attempt to convert relative URLs to absolute.
 332          if ( ! is_string( $url ) || '' === $url ) {
 333              return $icon;
 334          }
 335          $parsed_url = parse_url( $url );
 336          if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
 337              $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
 338              $icon     = WP_Http::make_absolute_url( $icon, $root_url );
 339          }
 340  
 341          return $icon;
 342      }
 343  
 344      /**
 345       * Parses the meta description from the provided HTML.
 346       *
 347       * @since 5.9.0
 348       *
 349       * @param array $meta_elements {
 350       *     A multidimensional indexed array on success, else empty array.
 351       *
 352       *     @type string[] $0 Meta elements with a content attribute.
 353       *     @type string[] $1 Content attribute's opening quotation mark.
 354       *     @type string[] $2 Content attribute's value for each meta element.
 355       * }
 356       * @return string The meta description contents on success. Empty string if not found.
 357       */
 358  	private function get_description( $meta_elements ) {
 359          // Bail out if there are no meta elements.
 360          if ( empty( $meta_elements[0] ) ) {
 361              return '';
 362          }
 363  
 364          $description = $this->get_metadata_from_meta_element(
 365              $meta_elements,
 366              'name',
 367              '(?:description|og:description)'
 368          );
 369  
 370          // Bail out if description not found.
 371          if ( '' === $description ) {
 372              return '';
 373          }
 374  
 375          return $this->prepare_metadata_for_output( $description );
 376      }
 377  
 378      /**
 379       * Parses the Open Graph (OG) Image from the provided HTML.
 380       *
 381       * See: https://ogp.me/.
 382       *
 383       * @since 5.9.0
 384       *
 385       * @param array  $meta_elements {
 386       *     A multidimensional indexed array on success, else empty array.
 387       *
 388       *     @type string[] $0 Meta elements with a content attribute.
 389       *     @type string[] $1 Content attribute's opening quotation mark.
 390       *     @type string[] $2 Content attribute's value for each meta element.
 391       * }
 392       * @param string $url The target website URL.
 393       * @return string The OG image on success. Empty string if not found.
 394       */
 395  	private function get_image( $meta_elements, $url ) {
 396          $image = $this->get_metadata_from_meta_element(
 397              $meta_elements,
 398              'property',
 399              '(?:og:image|og:image:url)'
 400          );
 401  
 402          // Bail out if image not found.
 403          if ( '' === $image ) {
 404              return '';
 405          }
 406  
 407          // Attempt to convert relative URLs to absolute.
 408          $parsed_url = parse_url( $url );
 409          if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
 410              $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
 411              $image    = WP_Http::make_absolute_url( $image, $root_url );
 412          }
 413  
 414          return $image;
 415      }
 416  
 417      /**
 418       * Prepares the metadata by:
 419       *    - stripping all HTML tags and tag entities.
 420       *    - converting non-tag entities into characters.
 421       *
 422       * @since 5.9.0
 423       *
 424       * @param string $metadata The metadata content to prepare.
 425       * @return string The prepared metadata.
 426       */
 427  	private function prepare_metadata_for_output( $metadata ) {
 428          $metadata = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) );
 429          $metadata = wp_strip_all_tags( $metadata );
 430          return $metadata;
 431      }
 432  
 433      /**
 434       * Utility function to build cache key for a given URL.
 435       *
 436       * @since 5.9.0
 437       *
 438       * @param string $url The URL for which to build a cache key.
 439       * @return string The cache key.
 440       */
 441  	private function build_cache_key_for_url( $url ) {
 442          return 'g_url_details_response_' . md5( $url );
 443      }
 444  
 445      /**
 446       * Utility function to retrieve a value from the cache at a given key.
 447       *
 448       * @since 5.9.0
 449       *
 450       * @param string $key The cache key.
 451       * @return mixed The value from the cache.
 452       */
 453  	private function get_cache( $key ) {
 454          return get_site_transient( $key );
 455      }
 456  
 457      /**
 458       * Utility function to cache a given data set at a given cache key.
 459       *
 460       * @since 5.9.0
 461       *
 462       * @param string $key  The cache key under which to store the value.
 463       * @param string $data The data to be stored at the given cache key.
 464       * @return bool True when transient set. False if not set.
 465       */
 466  	private function set_cache( $key, $data = '' ) {
 467          $ttl = HOUR_IN_SECONDS;
 468  
 469          /**
 470           * Filters the cache expiration.
 471           *
 472           * Can be used to adjust the time until expiration in seconds for the cache
 473           * of the data retrieved for the given URL.
 474           *
 475           * @since 5.9.0
 476           *
 477           * @param int $ttl The time until cache expiration in seconds.
 478           */
 479          $cache_expiration = apply_filters( 'rest_url_details_cache_expiration', $ttl );
 480  
 481          return set_site_transient( $key, $data, $cache_expiration );
 482      }
 483  
 484      /**
 485       * Retrieves the head element section.
 486       *
 487       * @since 5.9.0
 488       *
 489       * @param string $html The string of HTML to parse.
 490       * @return string The `<head>..</head>` section on success. Given `$html` if not found.
 491       */
 492  	private function get_document_head( $html ) {
 493          $head_html = $html;
 494  
 495          // Find the opening `<head>` tag.
 496          $head_start = strpos( $html, '<head' );
 497          if ( false === $head_start ) {
 498              // Didn't find it. Return the original HTML.
 499              return $html;
 500          }
 501  
 502          // Find the closing `</head>` tag.
 503          $head_end = strpos( $head_html, '</head>' );
 504          if ( false === $head_end ) {
 505              // Didn't find it. Find the opening `<body>` tag.
 506              $head_end = strpos( $head_html, '<body' );
 507  
 508              // Didn't find it. Return the original HTML.
 509              if ( false === $head_end ) {
 510                  return $html;
 511              }
 512          }
 513  
 514          // Extract the HTML from opening tag to the closing tag. Then add the closing tag.
 515          $head_html  = substr( $head_html, $head_start, $head_end );
 516          $head_html .= '</head>';
 517  
 518          return $head_html;
 519      }
 520  
 521      /**
 522       * Gets all the meta tag elements that have a 'content' attribute.
 523       *
 524       * @since 5.9.0
 525       *
 526       * @param string $html The string of HTML to be parsed.
 527       * @return array {
 528       *     A multidimensional indexed array on success, else empty array.
 529       *
 530       *     @type string[] $0 Meta elements with a content attribute.
 531       *     @type string[] $1 Content attribute's opening quotation mark.
 532       *     @type string[] $2 Content attribute's value for each meta element.
 533       * }
 534       */
 535  	private function get_meta_with_content_elements( $html ) {
 536          /*
 537           * Parse all meta elements with a content attribute.
 538           *
 539           * Why first search for the content attribute rather than directly searching for name=description element?
 540           * tl;dr The content attribute's value will be truncated when it contains a > symbol.
 541           *
 542           * The content attribute's value (i.e. the description to get) can have HTML in it and be well-formed as
 543           * it's a string to the browser. Imagine what happens when attempting to match for the name=description
 544           * first. Hmm, if a > or /> symbol is in the content attribute's value, then it terminates the match
 545           * as the element's closing symbol. But wait, it's in the content attribute and is not the end of the
 546           * element. This is a limitation of using regex. It can't determine "wait a minute this is inside of quotation".
 547           * If this happens, what gets matched is not the entire element or all of the content.
 548           *
 549           * Why not search for the name=description and then content="(.*)"?
 550           * The attribute order could be opposite. Plus, additional attributes may exist including being between
 551           * the name and content attributes.
 552           *
 553           * Why not lookahead?
 554           * Lookahead is not constrained to stay within the element. The first <meta it finds may not include
 555           * the name or content, but rather could be from a different element downstream.
 556           */
 557          $pattern = '#<meta\s' .
 558  
 559                  /*
 560                   * Allows for additional attributes before the content attribute.
 561                   * Searches for anything other than > symbol.
 562                   */
 563                  '[^>]*' .
 564  
 565                  /*
 566                  * Find the content attribute. When found, capture its value (.*).
 567                  *
 568                  * Allows for (a) single or double quotes and (b) whitespace in the value.
 569                  *
 570                  * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
 571                  * i.e \1, for the closing quotation mark?
 572                  * To ensure the closing quotation mark matches the opening one. Why? Attribute values
 573                  * can contain quotation marks, such as an apostrophe in the content.
 574                  */
 575                  'content=(["\']??)(.*)\1' .
 576  
 577                  /*
 578                  * Allows for additional attributes after the content attribute.
 579                  * Searches for anything other than > symbol.
 580                  */
 581                  '[^>]*' .
 582  
 583                  /*
 584                  * \/?> searches for the closing > symbol, which can be in either /> or > format.
 585                  * # ends the pattern.
 586                  */
 587                  '\/?>#' .
 588  
 589                  /*
 590                  * These are the options:
 591                  * - i : case-insensitive
 592                  * - s : allows newline characters for the . match (needed for multiline elements)
 593                  * - U means non-greedy matching
 594                  */
 595                  'isU';
 596  
 597          preg_match_all( $pattern, $html, $elements );
 598  
 599          return $elements;
 600      }
 601  
 602      /**
 603       * Gets the metadata from a target meta element.
 604       *
 605       * @since 5.9.0
 606       *
 607       * @param array  $meta_elements {
 608       *     A multi-dimensional indexed array on success, else empty array.
 609       *
 610       *     @type string[] $0 Meta elements with a content attribute.
 611       *     @type string[] $1 Content attribute's opening quotation mark.
 612       *     @type string[] $2 Content attribute's value for each meta element.
 613       * }
 614       * @param string $attr       Attribute that identifies the element with the target metadata.
 615       * @param string $attr_value The attribute's value that identifies the element with the target metadata.
 616       * @return string The metadata on success. Empty string if not found.
 617       */
 618  	private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
 619          // Bail out if there are no meta elements.
 620          if ( empty( $meta_elements[0] ) ) {
 621              return '';
 622          }
 623  
 624          $metadata = '';
 625          $pattern  = '#' .
 626                  /*
 627                   * Target this attribute and value to find the metadata element.
 628                   *
 629                   * Allows for (a) no, single, double quotes and (b) whitespace in the value.
 630                   *
 631                   * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
 632                   * i.e \1, for the closing quotation mark?
 633                   * To ensure the closing quotation mark matches the opening one. Why? Attribute values
 634                   * can contain quotation marks, such as an apostrophe in the content.
 635                   */
 636                  $attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' .
 637  
 638                  /*
 639                   * These are the options:
 640                   * - i : case-insensitive
 641                   * - s : allows newline characters for the . match (needed for multiline elements)
 642                   * - U means non-greedy matching
 643                   */
 644                  '#isU';
 645  
 646          // Find the metadata element.
 647          foreach ( $meta_elements[0] as $index => $element ) {
 648              preg_match( $pattern, $element, $match );
 649  
 650              // This is not the metadata element. Skip it.
 651              if ( empty( $match ) ) {
 652                  continue;
 653              }
 654  
 655              /*
 656               * Found the metadata element.
 657               * Get the metadata from its matching content array.
 658               */
 659              if ( isset( $meta_elements[2][ $index ] ) && is_string( $meta_elements[2][ $index ] ) ) {
 660                  $metadata = trim( $meta_elements[2][ $index ] );
 661              }
 662  
 663              break;
 664          }
 665  
 666          return $metadata;
 667      }
 668  }


Generated : Sat Dec 21 08:20:01 2024 Cross-referenced by PHPXref