Make WordPress Core

Changeset 61467


Ignore:
Timestamp:
01/10/2026 09:45:09 PM (3 days ago)
Author:
dmsnell
Message:

HTML API: Refactor wp_kses_hair() for spec-compliance.

wp_kses_hair() is built around an impressive state machine for parsing the span of text between an HTML tag name and the tag’s closing > into a structured representation of the attributes. Unfortunately, that parsing code doesn’t comply with the HTML Living Standard and is prone to mis-parsing attributes, particularly in the presence of malformed inputs.

This patch replaces the existing state machine with the spec-compliant parsing from the HTML API. With a comprehensive test suite covering attribute parsing, the same reliability the Tag Processor affords will be applied to wp_kses_hair(), giving new guarantees not previously available in Core:

  • All attribute values are reported fully-normalized, where character references are decoded and then re-encoded in a predictable manner. Only the “big five” syntax characters (“&<>'"”) will remain encoded as character references, and only in their named forms (&amp;, &lt;, &gt;, &apos;, &quot;).
  • All “whole” values are fully normalized and presented either as a bare attribute name (for boolean attributes without a value) or as the attribute name followed by a double-quoted attribute value.
  • All attributes and their values will be properly parsed according to how a browser would parse them, bringing agreement between the server and user agents.

Developed in https://github.com/WordPress/wordpress-develop/pull/9248
Discussed in https://core.trac.wordpress.org/ticket/63724

Props adamziel, dmsnell, jonsurrell, jorbin, westonruter.
Fixes #63724.

Location:
trunk
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/kses.php

    r61436 r61467  
    15861586
    15871587/**
    1588  * Builds an attribute list from string containing attributes.
    1589  *
    1590  * This function does a lot of work. It parses an attribute list into an array
    1591  * with attribute data, and tries to do the right thing even if it gets weird
    1592  * input. It will add quotes around attribute values that don't have any quotes
    1593  * or apostrophes around them, to make it easier to produce HTML code that will
    1594  * conform to W3C's HTML specification. It will also remove bad URL protocols
    1595  * from attribute values. It also reduces duplicate attributes by using the
    1596  * attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`).
     1588 * Given a string of HTML attributes and values, parse into a structured attribute list.
     1589 *
     1590 * This function performs a number of transformations while parsing attribute strings:
     1591 *  - It normalizes attribute values and surrounds them with double quotes.
     1592 *  - It normalizes HTML character references inside attribute values.
     1593 *  - It removes “bad” URL protocols from attribute values.
     1594 *
     1595 * Otherwise this reads the attributes as if they were part of an HTML tag. It performs
     1596 * these transformations to lower the risk of mis-parsing down the line and to perform
     1597 * URL sanitization in line with the rest of the `kses` subsystem. Importantly, it does
     1598 * not decode the attribute values, meaning that special HTML syntax characters will
     1599 * be left with character references in the `value` property.
     1600 *
     1601 * Example:
     1602 *
     1603 *     $attrs = wp_kses_hair( 'class="is-wide" inert data-lazy=\'&lt;img&#00062\' =/🐮=/' );
     1604 *     $attrs === array(
     1605 *         'class'     => array( 'name' => 'class', 'value' => 'is-wide', 'whole' => 'class="is-wide"', 'vless' => 'n' ),
     1606 *         'inert'     => array( 'name' => 'inert', 'value' => '', 'whole' => 'inert', 'vless' => 'y' ),
     1607 *         'data-lazy' => array( 'name' => 'data-lazy', 'value' => '&lt;img&gt;', 'whole' => 'data-lazy="&lt;img&gt;"', 'vless' => 'n' ),
     1608 *         '='         => array( 'name' => '=', 'value' => '', 'whole' => '=', 'vless' => 'y' ),
     1609 *         '🐮'        => array( 'name' => '🐮', 'value' => '/', 'whole' => '🐮="/"', 'vless' => 'n' ),
     1610 *     );
    15971611 *
    15981612 * @since 1.0.0
     1613 * @since 7.0.0 Reliably parses HTML via the HTML API.
    15991614 *
    16001615 * @param string   $attr              Attribute list from HTML element to closing HTML element tag.
    16011616 * @param string[] $allowed_protocols Array of allowed URL protocols.
    1602  * @return array[] Array of attribute information after parsing.
     1617 * @return array<string, array{name: string, value: string, whole: string, vless: 'y'|'n'}> Array of attribute information after parsing.
    16031618 */
    16041619function wp_kses_hair( $attr, $allowed_protocols ) {
    1605     $attrarr  = array();
    1606     $mode     = 0;
    1607     $attrname = '';
    1608     $uris     = wp_kses_uri_attributes();
    1609 
    1610     // Loop through the whole attribute list.
    1611 
    1612     while ( strlen( $attr ) !== 0 ) {
    1613         $working = 0; // Was the last operation successful?
    1614 
    1615         switch ( $mode ) {
    1616             case 0:
    1617                 if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) {
    1618                     $attrname = $match[1];
    1619                     $working  = 1;
    1620                     $mode     = 1;
    1621                     $attr     = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr );
    1622                 }
    1623 
    1624                 break;
    1625 
    1626             case 1:
    1627                 if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign.
    1628                     $working = 1;
    1629                     $mode    = 2;
    1630                     $attr    = preg_replace( '/^\s*=\s*/', '', $attr );
    1631                     break;
    1632                 }
    1633 
    1634                 if ( preg_match( '/^\s+/', $attr ) ) { // Valueless.
    1635                     $working = 1;
    1636                     $mode    = 0;
    1637 
    1638                     if ( false === array_key_exists( $attrname, $attrarr ) ) {
    1639                         $attrarr[ $attrname ] = array(
    1640                             'name'  => $attrname,
    1641                             'value' => '',
    1642                             'whole' => $attrname,
    1643                             'vless' => 'y',
    1644                         );
    1645                     }
    1646 
    1647                     $attr = preg_replace( '/^\s+/', '', $attr );
    1648                 }
    1649 
    1650                 break;
    1651 
    1652             case 2:
    1653                 if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) {
    1654                     // "value"
    1655                     $thisval = $match[1];
    1656                     if ( in_array( strtolower( $attrname ), $uris, true ) ) {
    1657                         $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
    1658                     }
    1659 
    1660                     if ( false === array_key_exists( $attrname, $attrarr ) ) {
    1661                         $attrarr[ $attrname ] = array(
    1662                             'name'  => $attrname,
    1663                             'value' => $thisval,
    1664                             'whole' => "$attrname=\"$thisval\"",
    1665                             'vless' => 'n',
    1666                         );
    1667                     }
    1668 
    1669                     $working = 1;
    1670                     $mode    = 0;
    1671                     $attr    = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr );
    1672                     break;
    1673                 }
    1674 
    1675                 if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) {
    1676                     // 'value'
    1677                     $thisval = $match[1];
    1678                     if ( in_array( strtolower( $attrname ), $uris, true ) ) {
    1679                         $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
    1680                     }
    1681 
    1682                     if ( false === array_key_exists( $attrname, $attrarr ) ) {
    1683                         $attrarr[ $attrname ] = array(
    1684                             'name'  => $attrname,
    1685                             'value' => $thisval,
    1686                             'whole' => "$attrname='$thisval'",
    1687                             'vless' => 'n',
    1688                         );
    1689                     }
    1690 
    1691                     $working = 1;
    1692                     $mode    = 0;
    1693                     $attr    = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr );
    1694                     break;
    1695                 }
    1696 
    1697                 if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) {
    1698                     // value
    1699                     $thisval = $match[1];
    1700                     if ( in_array( strtolower( $attrname ), $uris, true ) ) {
    1701                         $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
    1702                     }
    1703 
    1704                     if ( false === array_key_exists( $attrname, $attrarr ) ) {
    1705                         $attrarr[ $attrname ] = array(
    1706                             'name'  => $attrname,
    1707                             'value' => $thisval,
    1708                             'whole' => "$attrname=\"$thisval\"",
    1709                             'vless' => 'n',
    1710                         );
    1711                     }
    1712 
    1713                     // We add quotes to conform to W3C's HTML spec.
    1714                     $working = 1;
    1715                     $mode    = 0;
    1716                     $attr    = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr );
    1717                 }
    1718 
    1719                 break;
    1720         } // End switch.
    1721 
    1722         if ( 0 === $working ) { // Not well-formed, remove and try again.
    1723             $attr = wp_kses_html_error( $attr );
    1724             $mode = 0;
     1620    $attributes = array();
     1621    $uris       = wp_kses_uri_attributes();
     1622
     1623    $processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
     1624    $processor->next_token();
     1625
     1626    $syntax_characters = array(
     1627        '&' => '&amp;',
     1628        '<' => '&lt;',
     1629        '>' => '&gt;',
     1630        "'" => '&apos;',
     1631        '"' => '&quot;',
     1632    );
     1633
     1634    foreach ( $processor->get_attribute_names_with_prefix( '' ) as $name ) {
     1635        $value   = $processor->get_attribute( $name );
     1636        $is_bool = true === $value;
     1637        if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
     1638            $value = wp_kses_bad_protocol( $value, $allowed_protocols );
    17251639        }
    1726     } // End while.
    1727 
    1728     if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) {
    1729         /*
    1730          * Special case, for when the attribute list ends with a valueless
    1731          * attribute like "selected".
    1732          */
    1733         $attrarr[ $attrname ] = array(
    1734             'name'  => $attrname,
    1735             'value' => '',
    1736             'whole' => $attrname,
    1737             'vless' => 'y',
     1640
     1641        // Reconstruct and normalize the attribute value.
     1642        $recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
     1643        $whole   = $is_bool ? $name : "{$name}=\"{$recoded}\"";
     1644
     1645        $attributes[ $name ] = array(
     1646            'name'  => $name,
     1647            'value' => $recoded,
     1648            'whole' => $whole,
     1649            'vless' => $is_bool ? 'y' : 'n',
    17381650        );
    17391651    }
    17401652
    1741     return $attrarr;
     1653    return $attributes;
    17421654}
    17431655
  • trunk/tests/phpunit/tests/media.php

    r61416 r61467  
    228228            self::HTML_CONTENT,
    229229            $mark,
    230             'Test caption content should not contain the mark surround it: check test setup.'
     230            'Test caption content should not contain the mark surrounding it: check test setup.'
    231231        );
    232232
Note: See TracChangeset for help on using the changeset viewer.