diff --git a/.github/workflows/coding-standards.yml b/.github/workflows/coding-standards.yml deleted file mode 100644 index 2cdfa1d..0000000 --- a/.github/workflows/coding-standards.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: "CS" - -on: - pull_request: - push: - branches: - - master - -jobs: - coding-standards: - name: "CS Fixer" - runs-on: "ubuntu-20.04" - - steps: - - name: "Checkout" - uses: "actions/checkout@v2" - - - name: "Install PHP" - uses: "shivammathur/setup-php@v2" - with: - coverage: "none" - php-version: "7.4" - tools: cs2pr, pecl, composer:v2 - extensions: tidy - ini-values: "date.timezone=Europe/Paris" - env: - COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v2" - with: - composer-options: "--optimize-autoloader --prefer-dist" - - - name: "RUN PHP CS Fixer" - run: "php vendor/bin/php-cs-fixer fix --verbose --dry-run --format=checkstyle | cs2pr" diff --git a/htmLawed.php b/htmLawed.php index bedad6e..6676cff 100755 --- a/htmLawed.php +++ b/htmLawed.php @@ -1,1165 +1,1593 @@ + * @copyright (c) 2007-, Santosh Patnaik + * @dependency None + * @license LGPL 3 and GPL 2+ dual license + * @link https://bioinformatics.org/phplabware/internal_utilities/htmLawed + * @package htmLawed + * @php >=4.4 + * @time 2023-01-23 + * @version 1.2.11 */ -function htmLawed($t, $C = 1, $S = []) +/* + * Main function. + * Calls all other functions (alphabetically ordered further below). + * + * @param string $t HTM. + * @param mixed $C $config configuration option. + * @param mixed $S $spec specification option. + * @return string Filtered/sanitized $t. + */ +function htmLawed($t, $C=1, $S=array()) { - $C = is_array($C) ? $C : []; - if (!empty($C['valid_xhtml'])) { - $C['elements'] = empty($C['elements']) ? '*-acronym-big-center-dir-font-isindex-s-strike-tt' : $C['elements']; - $C['make_tag_strict'] = isset($C['make_tag_strict']) ? 
$C['make_tag_strict'] : 2; - $C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 2; - } - // config eles - $e = ['a' => 1, 'abbr' => 1, 'acronym' => 1, 'address' => 1, 'applet' => 1, 'area' => 1, 'article' => 1, 'aside' => 1, 'audio' => 1, 'b' => 1, 'bdi' => 1, 'bdo' => 1, 'big' => 1, 'blockquote' => 1, 'br' => 1, 'button' => 1, 'canvas' => 1, 'caption' => 1, 'center' => 1, 'cite' => 1, 'code' => 1, 'col' => 1, 'colgroup' => 1, 'command' => 1, 'data' => 1, 'datalist' => 1, 'dd' => 1, 'del' => 1, 'details' => 1, 'dialog' => 1, 'dfn' => 1, 'dir' => 1, 'div' => 1, 'dl' => 1, 'dt' => 1, 'em' => 1, 'embed' => 1, 'fieldset' => 1, 'figcaption' => 1, 'figure' => 1, 'font' => 1, 'footer' => 1, 'form' => 1, 'h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1, 'header' => 1, 'hgroup' => 1, 'hr' => 1, 'i' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'ins' => 1, 'isindex' => 1, 'kbd' => 1, 'keygen' => 1, 'label' => 1, 'legend' => 1, 'li' => 1, 'link' => 1, 'main' => 1, 'map' => 1, 'mark' => 1, 'menu' => 1, 'meta' => 1, 'meter' => 1, 'nav' => 1, 'noscript' => 1, 'object' => 1, 'ol' => 1, 'optgroup' => 1, 'option' => 1, 'output' => 1, 'p' => 1, 'param' => 1, 'picture' => 1, 'pre' => 1, 'progress' => 1, 'q' => 1, 'rb' => 1, 'rbc' => 1, 'rp' => 1, 'rt' => 1, 'rtc' => 1, 'ruby' => 1, 's' => 1, 'samp' => 1, 'script' => 1, 'section' => 1, 'select' => 1, 'slot' => 1, 'small' => 1, 'source' => 1, 'span' => 1, 'strike' => 1, 'strong' => 1, 'style' => 1, 'sub' => 1, 'summary' => 1, 'sup' => 1, 'table' => 1, 'tbody' => 1, 'td' => 1, 'template' => 1, 'textarea' => 1, 'tfoot' => 1, 'th' => 1, 'thead' => 1, 'time' => 1, 'tr' => 1, 'track' => 1, 'tt' => 1, 'u' => 1, 'ul' => 1, 'var' => 1, 'video' => 1, 'wbr' => 1]; // 122 incl. deprecated & some Ruby + // Standard elements including deprecated. 
- if (!empty($C['safe'])) { - unset($e['applet'], $e['audio'], $e['canvas'], $e['dialog'], $e['embed'], $e['iframe'], $e['object'], $e['script'], $e['video']); - } - $x = !empty($C['elements']) ? str_replace(["\n", "\r", "\t", ' '], '', strtolower($C['elements'])) : '*'; - if ('-*' === $x) { - $e = []; - } elseif (false === strpos($x, '*')) { - $e = array_flip(explode(',', $x)); - } else { - if (isset($x[1])) { - if (strpos($x, '(')) { - $x = preg_replace_callback('`\([^()]+\)`', function ($m) {return str_replace(['(', ')', '-'], ['', '', 'A'], $m[0]); }, $x); - } + $eleAr = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'dd'=>1, 'del'=>1, 'details'=>1, 'dialog'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'figcaption'=>1, 'figure'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'keygen'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'link'=>1, 'main'=>1, 'map'=>1, 'mark'=>1, 'menu'=>1, 'meta'=>1, 'meter'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'output'=>1, 'p'=>1, 'param'=>1, 'picture'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'slot'=>1, 'small'=>1, 'source'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'style'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'template'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'time'=>1, 'tr'=>1, 'track'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1); - 
preg_match_all('`(?:^|-|\+)[^\-+]+?(?=-|\+|$)`', $x, $m, \PREG_SET_ORDER); - for ($i = count($m); --$i >= 0;) { - $m[$i] = $m[$i][0]; - } - foreach ($m as $v) { - $v = str_replace('A', '-', $v); - if ('+' === $v[0]) { - $e[substr($v, 1)] = 1; - } elseif ('-' === $v[0]) { - if (strpos($v, '-', 1)) { - $e[$v] = 1; - } elseif (isset($e[($v = substr($v, 1))]) && !in_array('+' . $v, $m, true)) { - unset($e[$v]); - } - } - } - } - } - $C['elements'] = &$e; - // config attrs - $x = !empty($C['deny_attribute']) ? strtolower(preg_replace('"\s+-"', '/', trim($C['deny_attribute']))) : ''; - $x = array_flip((isset($x[0]) && '*' === $x[0]) ? explode('/', $x) : explode(',', $x . (!empty($C['safe']) ? ',on*' : ''))); - $C['deny_attribute'] = $x; - // config URLs - $x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet' . (empty($C['safe']) ? ', app, javascript; *: data, javascript, ' : '; *:') . 'file, http, https'; - $C['schemes'] = []; - foreach (explode(';', trim(str_replace([' ', "\t", "\r", "\n"], '', $x), ';')) as $v) { - $x = $x2 = null; - list($x, $x2) = explode(':', $v, 2); - if ($x2) { - $C['schemes'][$x] = array_flip(explode(',', $x2)); - } - } - if (!isset($C['schemes']['*'])) { - $C['schemes']['*'] = ['file' => 1, 'http' => 1, 'https' => 1]; - if (empty($C['safe'])) { - $C['schemes']['*'] += ['data' => 1, 'javascript' => 1]; - } - } - if (!empty($C['safe']) && empty($C['schemes']['style'])) { - $C['schemes']['style'] = ['!' => 1]; - } - $C['abs_url'] = isset($C['abs_url']) ? $C['abs_url'] : 0; - if (!isset($C['base_url']) || !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])) { - $C['base_url'] = $C['abs_url'] = 0; - } - // config rest - $C['and_mark'] = empty($C['and_mark']) ? 
0 : 1; - $C['anti_link_spam'] = (isset($C['anti_link_spam']) && is_array($C['anti_link_spam']) && 2 === count($C['anti_link_spam']) && (empty($C['anti_link_spam'][0]) || hl_regex($C['anti_link_spam'][0])) && (empty($C['anti_link_spam'][1]) || hl_regex($C['anti_link_spam'][1]))) ? $C['anti_link_spam'] : 0; - $C['anti_mail_spam'] = isset($C['anti_mail_spam']) ? $C['anti_mail_spam'] : 0; - $C['any_custom_element'] = (!isset($C['any_custom_element']) || !empty($C['any_custom_element'])) ? 1 : 0; - $C['balance'] = isset($C['balance']) ? (bool) $C['balance'] : 1; - $C['cdata'] = isset($C['cdata']) ? $C['cdata'] : (empty($C['safe']) ? 3 : 0); - $C['clean_ms_char'] = empty($C['clean_ms_char']) ? 0 : $C['clean_ms_char']; - $C['comment'] = isset($C['comment']) ? $C['comment'] : (empty($C['safe']) ? 3 : 0); - $C['css_expression'] = empty($C['css_expression']) ? 0 : 1; - $C['direct_list_nest'] = empty($C['direct_list_nest']) ? 0 : 1; - $C['hexdec_entity'] = isset($C['hexdec_entity']) ? $C['hexdec_entity'] : 1; - $C['hook'] = (!empty($C['hook']) && function_exists($C['hook'])) ? $C['hook'] : 0; - $C['hook_tag'] = (!empty($C['hook_tag']) && function_exists($C['hook_tag'])) ? $C['hook_tag'] : 0; - $C['keep_bad'] = isset($C['keep_bad']) ? $C['keep_bad'] : 6; - $C['lc_std_val'] = isset($C['lc_std_val']) ? (bool) $C['lc_std_val'] : 1; - $C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 1; - $C['named_entity'] = isset($C['named_entity']) ? (bool) $C['named_entity'] : 1; - $C['no_deprecated_attr'] = isset($C['no_deprecated_attr']) ? $C['no_deprecated_attr'] : 1; - $C['parent'] = isset($C['parent'][0]) ? strtolower($C['parent']) : 'body'; - $C['show_setting'] = !empty($C['show_setting']) ? $C['show_setting'] : 0; - $C['style_pass'] = empty($C['style_pass']) ? 0 : 1; - $C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy']; - $C['unique_ids'] = isset($C['unique_ids']) && (!preg_match('`\W`', $C['unique_ids'])) ? 
$C['unique_ids'] : 1; - $C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 0; - - if (isset($GLOBALS['C'])) { - $reC = $GLOBALS['C']; - } - $GLOBALS['C'] = $C; - $S = is_array($S) ? $S : hl_spec($S); - if (isset($GLOBALS['S'])) { - $reS = $GLOBALS['S']; - } - $GLOBALS['S'] = $S; + // Set $C array ($config), using default parameters as needed. - $t = preg_replace('`[\x00-\x08\x0b-\x0c\x0e-\x1f]`', '', $t); - if ($C['clean_ms_char']) { - $x = ["\x7f" => '', "\x80" => '€', "\x81" => '', "\x83" => 'ƒ', "\x85" => '…', "\x86" => '†', "\x87" => '‡', "\x88" => 'ˆ', "\x89" => '‰', "\x8a" => 'Š', "\x8b" => '‹', "\x8c" => 'Œ', "\x8d" => '', "\x8e" => 'Ž', "\x8f" => '', "\x90" => '', "\x95" => '•', "\x96" => '–', "\x97" => '—', "\x98" => '˜', "\x99" => '™', "\x9a" => 'š', "\x9b" => '›', "\x9c" => 'œ', "\x9d" => '', "\x9e" => 'ž', "\x9f" => 'Ÿ']; - $x = $x + (1 === $C['clean_ms_char'] ? ["\x82" => '‚', "\x84" => '„', "\x91" => '‘', "\x92" => '’', "\x93" => '“', "\x94" => '”'] : ["\x82" => '\'', "\x84" => '"', "\x91" => '\'', "\x92" => '\'', "\x93" => '"', "\x94" => '"']); - $t = strtr($t, $x); - } - if ($C['cdata'] || $C['comment']) { - $t = preg_replace_callback('``sm', 'hl_cmtcd', $t); - } - $t = preg_replace_callback('`&([a-zA-Z][a-zA-Z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'hl_ent', str_replace('&', '&', $t)); - if ($C['unique_ids'] && !isset($GLOBALS['hl_Ids'])) { - $GLOBALS['hl_Ids'] = []; - } - if ($C['hook']) { - $t = $C['hook']($t, $C, $S); - } - if ($C['show_setting'] && preg_match('`^[a-z][a-z0-9_]*$`i', $C['show_setting'])) { - $GLOBALS[$C['show_setting']] = ['config' => $C, 'spec' => $S, 'time' => microtime()]; - } - // main - $t = preg_replace_callback('`<(?:(?:\s|$)|(?:[^>]*(?:>|$)))|>`m', 'hl_tag', $t); - $t = $C['balance'] ? hl_bal($t, $C['keep_bad'], $C['parent']) : $t; - $t = (($C['cdata'] || $C['comment']) && false !== strpos($t, "\x01")) ? 
str_replace(["\x01", "\x02", "\x03", "\x04", "\x05"], ['', '', '&', '<', '>'], $t) : $t; - $t = $C['tidy'] ? hl_tidy($t, $C['tidy'], $C['parent']) : $t; - unset($C, $e); - if (isset($reC)) { - $GLOBALS['C'] = $reC; - } - if (isset($reS)) { - $GLOBALS['S'] = $reS; - } + $C = is_array($C) ? $C : array(); + if (!empty($C['valid_xhtml'])) { + $C['elements'] = empty($C['elements']) ? '*-acronym-big-center-dir-font-isindex-s-strike-tt' : $C['elements']; + $C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 2; + $C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 2; + } - return $t; -} + // -- Configure for elements. -function hl_attrval($a, $t, $p) -{ - // check attr val against $S - static $ma = ['accesskey', 'class', 'itemtype', 'rel']; - $s = in_array($a, $ma, true) ? ' ' : ('srcset' === $a ? ',' : ''); - $r = []; - $t = !empty($s) ? explode($s, $t) : [$t]; - foreach ($t as $tk => $tv) { - $o = 1; - $tv = trim($tv); - $l = strlen($tv); - foreach ($p as $k => $v) { - if (!$l) { - continue; - } - switch ($k) { - case 'maxlen': - if ($l > $v) { - $o = 0; - } - break; - case 'minlen': - if ($l < $v) { - $o = 0; - } - break; - case 'maxval': - if ((float) ($tv) > $v) { - $o = 0; - } - break; - case 'minval': - if ((float) ($tv) < $v) { - $o = 0; - } - break; - case 'match': - if (!preg_match($v, $tv)) { - $o = 0; - } - break; - case 'nomatch': - if (preg_match($v, $tv)) { - $o = 0; - } - break; - case 'oneof': - $m = 0; - foreach (explode('|', $v) as $n) { - if ($tv === $n) { - $m = 1; - break; - } - } - $o = $m; - break; - case 'noneof': - $m = 1; - foreach (explode('|', $v) as $n) { - if ($tv === $n) { - $m = 0; - break; - } - } - $o = $m; - break; - default: - break; - } - if (!$o) { - break; - } - } - if ($o) { - $r[] = $tv; + if (!empty($C['safe'])) { + unset($eleAr['applet'], $eleAr['audio'], $eleAr['canvas'], $eleAr['dialog'], $eleAr['embed'], $eleAr['iframe'], $eleAr['object'], $eleAr['script'], $eleAr['video']); + } + $x = 
!empty($C['elements']) ? str_replace(array("\n", "\r", "\t", ' '), '', strtolower($C['elements'])) : '*'; + if ($x == '-*') { + $eleAr = array(); + } elseif (strpos($x, '*') === false) { + $eleAr = array_flip(explode(',', $x)); + } else { + if (isset($x[1])) { + if (strpos($x, '(')) { // Temporarily replace hyphen of custom element, minus being special character + $x = + preg_replace_callback( + '`\([^()]+\)`', + function ($m) { + return str_replace(array('(', ')', '-'), array('', '', 'A'), $m[0]); + }, + $x); + } + preg_match_all('`(?:^|-|\+)[^\-+]+?(?=-|\+|$)`', $x, $m, PREG_SET_ORDER); + for ($i=count($m); --$i>=0;) { + $m[$i] = $m[$i][0]; + } + foreach ($m as $v) { + $v = str_replace('A', '-', $v); + if ($v[0] == '+') { + $eleAr[substr($v, 1)] = 1; + } elseif ($v[0] == '-') { + if (strpos($v, '-', 1)) { + $eleAr[$v] = 1; + } elseif (isset($eleAr[($v = substr($v, 1))]) && !in_array('+'. $v, $m)) { + unset($eleAr[$v]); + } } + } } - if (',' === $s) { - $s = ', '; - } - $r = implode($s, $r); + } + $C['elements'] =& $eleAr; - return isset($r[0]) ? $r : (isset($p['default']) ? $p['default'] : 0); -} + // -- Configure for attributes. 
-function hl_bal($t, $do = 1, $in = 'div') -{ - // balance tags - // by content - $cB = ['blockquote' => 1, 'form' => 1, 'map' => 1, 'noscript' => 1]; // Block - $cE = ['area' => 1, 'br' => 1, 'col' => 1, 'command' => 1, 'embed' => 1, 'hr' => 1, 'img' => 1, 'input' => 1, 'isindex' => 1, 'keygen' => 1, 'link' => 1, 'meta' => 1, 'param' => 1, 'source' => 1, 'track' => 1, 'wbr' => 1]; // Empty - $cF = ['a' => 1, 'article' => 1, 'aside' => 1, 'audio' => 1, 'button' => 1, 'canvas' => 1, 'del' => 1, 'details' => 1, 'dialog' => 1, 'div' => 1, 'dd' => 1, 'fieldset' => 1, 'figure' => 1, 'footer' => 1, 'header' => 1, 'iframe' => 1, 'ins' => 1, 'li' => 1, 'main' => 1, 'menu' => 1, 'nav' => 1, 'noscript' => 1, 'object' => 1, 'section' => 1, 'slot' => 1, 'style' => 1, 'td' => 1, 'template' => 1, 'th' => 1, 'video' => 1]; // Flow; later context-wise dynamic move of ins & del to $cI - $cI = ['abbr' => 1, 'acronym' => 1, 'address' => 1, 'b' => 1, 'bdi' => 1, 'bdo' => 1, 'big' => 1, 'caption' => 1, 'cite' => 1, 'code' => 1, 'data' => 1, 'datalist' => 1, 'dfn' => 1, 'dt' => 1, 'em' => 1, 'figcaption' => 1, 'font' => 1, 'h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1, 'hgroup' => 1, 'i' => 1, 'kbd' => 1, 'label' => 1, 'legend' => 1, 'mark' => 1, 'meter' => 1, 'output' => 1, 'p' => 1, 'picture' => 1, 'pre' => 1, 'progress' => 1, 'q' => 1, 'rb' => 1, 'rt' => 1, 's' => 1, 'samp' => 1, 'small' => 1, 'span' => 1, 'strike' => 1, 'strong' => 1, 'sub' => 1, 'summary' => 1, 'sup' => 1, 'time' => 1, 'tt' => 1, 'u' => 1, 'var' => 1]; // Inline - $cN = ['a' => ['a' => 1, 'address' => 1, 'button' => 1, 'details' => 1, 'embed' => 1, 'keygen' => 1, 'label' => 1, 'select' => 1, 'textarea' => 1], 'address' => ['address' => 1, 'article' => 1, 'aside' => 1, 'header' => 1, 'keygen' => 1, 'footer' => 1, 'nav' => 1, 'section' => 1], 'button' => ['a' => 1, 'address' => 1, 'button' => 1, 'details' => 1, 'embed' => 1, 'fieldset' => 1, 'form' => 1, 'iframe' => 1, 'input' => 1, 'keygen' => 1, 
'label' => 1, 'select' => 1, 'textarea' => 1], 'fieldset' => ['fieldset' => 1], 'footer' => ['header' => 1, 'footer' => 1], 'form' => ['form' => 1], 'header' => ['header' => 1, 'footer' => 1], 'label' => ['label' => 1], 'main' => ['main' => 1], 'meter' => ['meter' => 1], 'noscript' => ['script' => 1], 'pre' => ['big' => 1, 'font' => 1, 'img' => 1, 'object' => 1, 'script' => 1, 'small' => 1, 'sub' => 1, 'sup' => 1], 'progress' => ['progress' => 1], 'rb' => ['ruby' => 1], 'rt' => ['ruby' => 1], 'time' => ['time' => 1]]; // Illegal - $cN2 = array_keys($cN); - $cS = ['colgroup' => ['col' => 1], 'datalist' => ['option' => 1], 'dir' => ['li' => 1], 'dl' => ['dd' => 1, 'dt' => 1], 'hgroup' => ['h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1], 'menu' => ['li' => 1], 'ol' => ['li' => 1], 'optgroup' => ['option' => 1], 'option' => ['#pcdata' => 1], 'rbc' => ['rb' => 1], 'rp' => ['#pcdata' => 1], 'rtc' => ['rt' => 1], 'ruby' => ['rb' => 1, 'rbc' => 1, 'rp' => 1, 'rt' => 1, 'rtc' => 1, '#pcdata' => 1], 'select' => ['optgroup' => 1, 'option' => 1], 'script' => ['#pcdata' => 1], 'table' => ['caption' => 1, 'col' => 1, 'colgroup' => 1, 'tfoot' => 1, 'tbody' => 1, 'tr' => 1, 'thead' => 1], 'tbody' => ['tr' => 1], 'tfoot' => ['tr' => 1], 'textarea' => ['#pcdata' => 1], 'thead' => ['tr' => 1], 'tr' => ['td' => 1, 'th' => 1], 'ul' => ['li' => 1]]; // Specific - immediate parent-child - if ($GLOBALS['C']['direct_list_nest']) { - $cS['ol'] = $cS['ul'] = $cS['menu'] += ['menu' => 1, 'ol' => 1, 'ul' => 1]; - } - $cO = ['address' => ['p' => 1], 'applet' => ['param' => 1], 'audio' => ['source' => 1, 'track' => 1], 'blockquote' => ['script' => 1], 'details' => ['summary' => 1], 'fieldset' => ['legend' => 1, '#pcdata' => 1], 'figure' => ['figcaption' => 1], 'form' => ['script' => 1], 'map' => ['area' => 1], 'object' => ['param' => 1, 'embed' => 1], 'video' => ['source' => 1, 'track' => 1]]; // Other - $cT = ['colgroup' => 1, 'dd' => 1, 'dt' => 1, 'li' => 1, 'option' => 1, 
'p' => 1, 'td' => 1, 'tfoot' => 1, 'th' => 1, 'thead' => 1, 'tr' => 1]; // Omitable closing - // block/inline type; a/ins/del both type; #pcdata: text - $eB = ['a' => 1, 'address' => 1, 'article' => 1, 'aside' => 1, 'blockquote' => 1, 'center' => 1, 'del' => 1, 'details' => 1, 'dialog' => 1, 'dir' => 1, 'dl' => 1, 'div' => 1, 'fieldset' => 1, 'figure' => 1, 'footer' => 1, 'form' => 1, 'ins' => 1, 'h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1, 'header' => 1, 'hr' => 1, 'isindex' => 1, 'main' => 1, 'menu' => 1, 'nav' => 1, 'noscript' => 1, 'ol' => 1, 'p' => 1, 'pre' => 1, 'section' => 1, 'slot' => 1, 'style' => 1, 'table' => 1, 'template' => 1, 'ul' => 1]; - $eI = ['#pcdata' => 1, 'a' => 1, 'abbr' => 1, 'acronym' => 1, 'applet' => 1, 'audio' => 1, 'b' => 1, 'bdi' => 1, 'bdo' => 1, 'big' => 1, 'br' => 1, 'button' => 1, 'canvas' => 1, 'cite' => 1, 'code' => 1, 'command' => 1, 'data' => 1, 'datalist' => 1, 'del' => 1, 'dfn' => 1, 'em' => 1, 'embed' => 1, 'figcaption' => 1, 'font' => 1, 'i' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'ins' => 1, 'kbd' => 1, 'label' => 1, 'link' => 1, 'map' => 1, 'mark' => 1, 'meta' => 1, 'meter' => 1, 'object' => 1, 'output' => 1, 'picture' => 1, 'progress' => 1, 'q' => 1, 'ruby' => 1, 's' => 1, 'samp' => 1, 'select' => 1, 'script' => 1, 'small' => 1, 'span' => 1, 'strike' => 1, 'strong' => 1, 'sub' => 1, 'summary' => 1, 'sup' => 1, 'textarea' => 1, 'time' => 1, 'tt' => 1, 'u' => 1, 'var' => 1, 'video' => 1, 'wbr' => 1]; - $eN = ['a' => 1, 'address' => 1, 'article' => 1, 'aside' => 1, 'big' => 1, 'button' => 1, 'details' => 1, 'embed' => 1, 'fieldset' => 1, 'font' => 1, 'footer' => 1, 'form' => 1, 'header' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'keygen' => 1, 'label' => 1, 'meter' => 1, 'nav' => 1, 'object' => 1, 'progress' => 1, 'ruby' => 1, 'script' => 1, 'select' => 1, 'small' => 1, 'sub' => 1, 'sup' => 1, 'textarea' => 1, 'time' => 1]; // Exclude from specific ele; $cN values - $eO = ['area' => 1, 
'caption' => 1, 'col' => 1, 'colgroup' => 1, 'command' => 1, 'dd' => 1, 'dt' => 1, 'hgroup' => 1, 'keygen' => 1, 'legend' => 1, 'li' => 1, 'optgroup' => 1, 'option' => 1, 'param' => 1, 'rb' => 1, 'rbc' => 1, 'rp' => 1, 'rt' => 1, 'rtc' => 1, 'script' => 1, 'source' => 1, 'tbody' => 1, 'td' => 1, 'tfoot' => 1, 'thead' => 1, 'th' => 1, 'tr' => 1, 'track' => 1]; // Missing in $eB & $eI - $eF = $eB + $eI; - - // $in sets allowed child - $in = ((isset($eF[$in]) && '#pcdata' !== $in) || isset($eO[$in])) ? $in : 'div'; - if (isset($cE[$in])) { - return !$do ? '' : str_replace(['<', '>'], ['<', '>'], $t); - } - if (isset($cS[$in])) { - $inOk = $cS[$in]; - } elseif (isset($cI[$in])) { - $inOk = $eI; - $cI['del'] = 1; - $cI['ins'] = 1; - } elseif (isset($cF[$in])) { - $inOk = $eF; - unset($cI['del'], $cI['ins']); - } elseif (isset($cB[$in])) { - $inOk = $eB; - unset($cI['del'], $cI['ins']); - } - if (isset($cO[$in])) { - $inOk = $inOk + $cO[$in]; - } - if (isset($cN[$in])) { - $inOk = array_diff_assoc($inOk, $cN[$in]); - } - if (strpos($in, '-')) { - $inOk = ['*' => 1, '#pcdata' => 1]; - } // custom ele + $x = !empty($C['deny_attribute']) ? strtolower(preg_replace('"\s+-"', '/', trim($C['deny_attribute']))) : ''; + $x = str_replace(array(' ', "\t", "\r", "\n"), '', $x); + $x = + array_flip( + (isset($x[0]) && $x[0] == '*') + ? preg_replace( + '`^[^*]`', + '-'. '\\0', + explode( + '/', + (!empty($C['safe']) ? preg_replace('`/on[^/]+`', '', $x) : $x))) + : array_filter(explode(',', $x. (!empty($C['safe']) ? ',on*' : '')))); + $C['deny_attribute'] = $x; - $t = explode('<', $t); - $ok = $q = []; // $q seq list of open non-empty ele - ob_start(); + // -- Configure URL handling. 
- for ($i = -1, $ci = count($t); ++$i < $ci;) { - // allowed $ok in parent $p - if ($ql = count($q)) { - $p = array_pop($q); - $q[] = $p; - if (isset($cS[$p])) { - $ok = $cS[$p]; - } elseif (isset($cI[$p])) { - $ok = $eI; - $cI['del'] = 1; - $cI['ins'] = 1; - } elseif (isset($cF[$p])) { - $ok = $eF; - unset($cI['del'], $cI['ins']); - } elseif (isset($cB[$p])) { - $ok = $eB; - unset($cI['del'], $cI['ins']); - } - if (isset($cO[$p])) { - $ok = $ok + $cO[$p]; - } - if (isset($cN[$p])) { - $ok = array_diff_assoc($ok, $cN[$p]); - } - if (strpos($p, '-')) { - $ok = ['*' => 1, '#pcdata' => 1]; - } - } else { - $ok = $inOk; - unset($cI['del'], $cI['ins']); - } - // bad tags, & ele content - if (isset($e) && (1 === $do || (isset($ok['#pcdata']) && (3 === $do || 5 === $do)))) { - echo '<', $s, $e, $a, '>'; - } - if (isset($x[0])) { - if (strlen(trim($x)) && (($ql && isset($cB[$p])) || (isset($cB[$in]) && !$ql))) { - echo '
', $x, '
'; - } elseif ($do < 3 || isset($ok['#pcdata'])) { - echo $x; - } elseif (strpos($x, "\x02\x04")) { - foreach (preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY) as $v) { - echo "\x01\x02" === substr($v, 0, 2) ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : ''); - } - } elseif ($do > 4) { - echo preg_replace('`\S`', '', $x); - } - } - // get markup - if (!preg_match('`^(/?)([a-z][^ >]*)([^>]*)>(.*)`sm', $t[$i], $r)) { - $x = $t[$i]; - continue; + $x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':') + ? strtolower($C['schemes']) + : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet') + . (empty($C['safe']) + ? ', app, javascript; *: data, javascript, ' + : '; *:') + . 'file, http, https'; + $C['schemes'] = array(); + foreach (explode(';', trim(str_replace(array(' ', "\t", "\r", "\n"), '', $x), ';')) as $v) { + if(strpos($v, ':')) { + list($x, $y) = explode(':', $v, 2); + $C['schemes'][$x] = array_flip(explode(',', $y)); + } + } + if (!isset($C['schemes']['*'])) { + $C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1); + if (empty($C['safe'])) { + $C['schemes']['*'] += array('data'=>1, 'javascript'=>1); + } + } + if (!empty($C['safe']) && empty($C['schemes']['style'])) { + $C['schemes']['style'] = array('!'=>1); + } + $C['abs_url'] = isset($C['abs_url']) ? $C['abs_url'] : 0; + if (!isset($C['base_url']) || !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])) { + $C['base_url'] = $C['abs_url'] = 0; + } + + // -- Configure other parameters. + + $C['and_mark'] = empty($C['and_mark']) ? 0 : 1; + $C['anti_link_spam'] = + (isset($C['anti_link_spam']) + && is_array($C['anti_link_spam']) + && count($C['anti_link_spam']) == 2 + && (empty($C['anti_link_spam'][0]) + || hl_regex($C['anti_link_spam'][0])) + && (empty($C['anti_link_spam'][1]) + || hl_regex($C['anti_link_spam'][1]))) + ? 
$C['anti_link_spam'] + : 0; + $C['anti_mail_spam'] = isset($C['anti_mail_spam']) ? $C['anti_mail_spam'] : 0; + $C['any_custom_element'] = (!isset($C['any_custom_element']) || !empty($C['any_custom_element'])) ? 1 : 0; + $C['balance'] = isset($C['balance']) ? (bool)$C['balance'] : 1; + $C['cdata'] = isset($C['cdata']) ? $C['cdata'] : (empty($C['safe']) ? 3 : 0); + $C['clean_ms_char'] = empty($C['clean_ms_char']) ? 0 : $C['clean_ms_char']; + $C['comment'] = isset($C['comment']) ? $C['comment'] : (empty($C['safe']) ? 3 : 0); + $C['css_expression'] = empty($C['css_expression']) ? 0 : 1; + $C['direct_list_nest'] = empty($C['direct_list_nest']) ? 0 : 1; + $C['hexdec_entity'] = isset($C['hexdec_entity']) ? $C['hexdec_entity'] : 1; + $C['hook'] = (!empty($C['hook']) && is_callable($C['hook'])) ? $C['hook'] : 0; + $C['hook_tag'] = (!empty($C['hook_tag']) && is_callable($C['hook_tag'])) ? $C['hook_tag'] : 0; + $C['keep_bad'] = isset($C['keep_bad']) ? $C['keep_bad'] : 6; + $C['lc_std_val'] = isset($C['lc_std_val']) ? (bool)$C['lc_std_val'] : 1; + $C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 1; + $C['named_entity'] = isset($C['named_entity']) ? (bool)$C['named_entity'] : 1; + $C['no_deprecated_attr'] = isset($C['no_deprecated_attr']) ? $C['no_deprecated_attr'] : 1; + $C['parent'] = isset($C['parent'][0]) ? strtolower($C['parent']) : 'body'; + $C['show_setting'] = !empty($C['show_setting']) ? $C['show_setting'] : 0; + $C['style_pass'] = empty($C['style_pass']) ? 0 : 1; + $C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy']; + $C['unique_ids'] = isset($C['unique_ids']) && (!preg_match('`\W`', $C['unique_ids'])) ? $C['unique_ids'] : 1; + $C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 0; + + if (isset($GLOBALS['C'])) { + $oldC = $GLOBALS['C']; + } + $GLOBALS['C'] = $C; + + // Set $S array ($spec). + + $S = is_array($S) ? 
$S : hl_spec($S); + if (isset($GLOBALS['S'])) { + $oldS = $GLOBALS['S']; + } + $GLOBALS['S'] = $S; + + // Handle characters. + + $t = preg_replace('`[\x00-\x08\x0b-\x0c\x0e-\x1f]`', '', $t); // Remove illegal + if ($C['clean_ms_char']) { // Convert MS Windows CP-1252 + $x = array("\x7f"=>'', "\x80"=>'€', "\x81"=>'', "\x83"=>'ƒ', "\x85"=>'…', "\x86"=>'†', "\x87"=>'‡', "\x88"=>'ˆ', "\x89"=>'‰', "\x8a"=>'Š', "\x8b"=>'‹', "\x8c"=>'Œ', "\x8d"=>'', "\x8e"=>'Ž', "\x8f"=>'', "\x90"=>'', "\x95"=>'•', "\x96"=>'–', "\x97"=>'—', "\x98"=>'˜', "\x99"=>'™', "\x9a"=>'š', "\x9b"=>'›', "\x9c"=>'œ', "\x9d"=>'', "\x9e"=>'ž', "\x9f"=>'Ÿ'); + $x = $x + + ($C['clean_ms_char'] == 1 + ? array("\x82"=>'‚', "\x84"=>'„', "\x91"=>'‘', "\x92"=>'’', "\x93"=>'“', "\x94"=>'”') + : array("\x82"=>'\'', "\x84"=>'"', "\x91"=>'\'', "\x92"=>'\'', "\x93"=>'"', "\x94"=>'"')); + $t = strtr($t, $x); + } + + // Handle CDATA, comments, and entities. + + if ($C['cdata'] || $C['comment']) { + $t = preg_replace_callback('``sm', 'hl_commentCdata', $t); + } + $t = + preg_replace_callback( + '`&([a-zA-Z][a-zA-Z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', + 'hl_entity', + str_replace('&', '&', $t)); + if ($C['unique_ids'] && !isset($GLOBALS['hl_Ids'])) { + $GLOBALS['hl_Ids'] = array(); + } + + if ($C['hook']) { + $t = call_user_func($C['hook'], $t, $C, $S); + } + + // Handle remaining text. + + $t = preg_replace_callback('`<(?:(?:\s|$)|(?:[^>]*(?:>|$)))|>`m', 'hl_tag', $t); + $t = $C['balance'] ? hl_balance($t, $C['keep_bad'], $C['parent']) : $t; + $t = (($C['cdata'] || $C['comment']) && strpos($t, "\x01") !== false) + ? str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05"), array('', '', '&', '<', '>'), $t) + : $t; + $t = $C['tidy'] ? hl_tidy($t, $C['tidy'], $C['parent']) : $t; + + // Cleanup. 
+ + if ($C['show_setting'] && preg_match('`^[a-z][a-z0-9_]*$`i', $C['show_setting'])) { + $GLOBALS[$C['show_setting']] = array('config'=>$C, 'spec'=>$S, 'time'=>microtime(true), 'version'=>hl_version()); + } + unset($C, $eleAr); + if (isset($oldC)) { + $GLOBALS['C'] = $oldC; + } + if (isset($oldS)) { + $GLOBALS['S'] = $oldS; + } + return $t; +} + +/** + * Validate attribute value and possibly reset to a default. + * + * @param string $attr Attribute name. + * @param string $value Attribute value. + * @param array $ruleAr Array of rules derived from $spec. + * @param string $ele Element. + * @return mixed 0 if invalid $value, + * or string with validated or default value. + */ +function hl_attributeValue($attr, $value, $ruleAr, $ele) +{ + static $spacedValsAttrAr = array('accesskey', 'class', 'itemtype', 'rel'); // Some attributes have multiple values + $valSep = + (in_array($attr, $spacedValsAttrAr) || ($attr == 'archive' && $ele == 'object')) + ? ' ' + : (($attr == 'srcset' || ($attr == 'archive' && $ele == 'applet')) + ? ',' + : ''); + $out = array(); + $valAr = !empty($valSep) ? 
explode($valSep, $value) : array($value); + foreach ($valAr as $v) { + $ok = 1; + $v = trim($v); + $lengthVal = strlen($v); + foreach ($ruleAr as $ruleType=>$ruleVal) { + if (!$lengthVal) { + continue; + } + switch ($ruleType) { + case 'maxlen': if ($lengthVal > $ruleVal) { + $ok = 0; } - $s = null; - $e = null; - $a = null; - $x = null; - list($all, $s, $e, $a, $x) = $r; - // close tag - if ($s) { - if (isset($cE[$e]) || !in_array($e, $q, true)) { - continue; - } // Empty/unopen - if ($p === $e) { - array_pop($q); - echo ''; - unset($e); - continue; - } // Last open - $add = ''; // Nesting - close open tags that need to be - for ($j = -1, $cj = count($q); ++$j < $cj;) { - if (($d = array_pop($q)) === $e) { - break; - } - $add .= ""; - } - echo $add, ''; - unset($e); - continue; + break; case 'minlen': if ($lengthVal < $ruleVal) { + $ok = 0; } - // open tag - // $cB ele needs $eB ele as child - if (isset($cB[$e]) && strlen(trim($x))) { - $t[$i] = "{$e}{$a}>"; - array_splice($t, $i + 1, 0, 'div>' . 
$x); - unset($e, $x); - ++$ci; - --$i; - continue; + break; case 'maxval': if ((float)($v) > $ruleVal) { + $ok = 0; } - if (strpos($e, '-')) { - $ok[$e] = 1; + break; case 'minval': if ((float)($v) < $ruleVal) { + $ok = 0; } - if ((($ql && isset($cB[$p])) || (isset($cB[$in]) && !$ql)) && !isset($eB[$e]) && !isset($ok[$e]) && !isset($ok['*'])) { - array_splice($t, $i, 0, 'div>'); - unset($e, $x); - ++$ci; - --$i; - continue; + break; case 'match': if (!preg_match($ruleVal, $v)) { + $ok = 0; } - // if no open ele, $in = parent; mostly immediate parent-child relation should hold - if (!$ql || !isset($eN[$e]) || !array_intersect($q, $cN2)) { - if (!isset($ok[$e]) && !isset($ok['*'])) { - if ($ql && isset($cT[$p])) { - echo ''; - unset($e, $x); - --$i; - } - continue; - } - if (!isset($cE[$e])) { - $q[] = $e; - } - echo '<', $e, $a, '>'; - unset($e); - continue; + break; case 'nomatch': if (preg_match($ruleVal, $v)) { + $ok = 0; } - // specific parent-child - if (isset($cS[$p][$e])) { - if (!isset($cE[$e])) { - $q[] = $e; - } - echo '<', $e, $a, '>'; - unset($e); - continue; + break; case 'oneof': if(!in_array($v, explode('|', $ruleVal))) { + $ok = 0; } - // nesting - $add = ''; - $q2 = []; - for ($k = -1, $kc = count($q); ++$k < $kc;) { - $d = $q[$k]; - $ok2 = []; - if (isset($cS[$d])) { - $q2[] = $d; - continue; - } - $ok2 = isset($cI[$d]) ? $eI : $eF; - if (isset($cO[$d])) { - $ok2 = $ok2 + $cO[$d]; - } - if (isset($cN[$d])) { - $ok2 = array_diff_assoc($ok2, $cN[$d]); - } - if (!isset($ok2[$e]) && !strpos($e, '-')) { - if (!$k && !isset($inOk[$e]) && !isset($inOk['*'])) { - continue 2; - } - $add = ""; - for (; ++$k < $kc;) { - $add = "{$add}"; - } - break; - } - $q2[] = $d; + break; case 'noneof': if(in_array($v, explode('|', $ruleVal))) { + $ok = 0; } - $q = $q2; - if (!isset($cE[$e])) { - $q[] = $e; + break; default: + break; + } + if (!$ok) { + break; + } + } + if ($ok) { + $out[] = $v; + } + } + $out = implode($valSep == ',' ? 
', ' : ' ', $out); + return (isset($out[0]) ? $out : (isset($ruleAr['default']) ? $ruleAr['default'] : 0)); +} + +/* + * Enforce parent-child validity of elements and balance tags. + * + * @param string $t HTM. Previously partly sanitized/filtered. CDATA + * and comment sections have characters hidden. + * @param int $act $config's keep_bad parameter. + * @param string $parentEle $t's parent element option. + * @return string $t with valid nesting and balanced tags. + */ +function hl_balance($t, $act=1, $parentEle='div') +{ + // Group elements in different ways. + + $closingTagOmitableEleAr = array('caption'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'p'=>1, 'rp'=>1, 'rt'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); + + // -- Block, inline, etc. + + $blockEleAr = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'details'=>1, 'dialog'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hr'=>1, 'isindex'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'section'=>1, 'slot'=>1, 'style'=>1, 'table'=>1, 'template'=>1, 'ul'=>1); + $inlineEleAr = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'cite'=>1, 'code'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'figcaption'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'link'=>1, 'map'=>1, 'mark'=>1, 'meta'=>1, 'meter'=>1, 'object'=>1, 'output'=>1, 'picture'=>1, 'progress'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1); + $otherEleAr 
= array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'dd'=>1, 'dt'=>1, 'hgroup'=>1, 'keygen'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'source'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1, 'track'=>1); + $flowEleAr = $blockEleAr + $inlineEleAr; + + // -- Type of child allowed. + + $blockKidEleAr = array('blockquote'=>1, 'form'=>1, 'map'=>1, 'noscript'=>1); + $flowKidEleAr = array('a'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'button'=>1, 'canvas'=>1, 'del'=>1, 'details'=>1, 'dialog'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'header'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'section'=>1, 'slot'=>1, 'style'=>1, 'td'=>1, 'template'=>1, 'th'=>1, 'video'=>1); // Later context-wise dynamic move of ins & del to $inlineKidEleAr + $inlineKidEleAr = array('abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'data'=>1, 'datalist'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'figcaption'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hgroup'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'mark'=>1, 'meter'=>1, 'output'=>1, 'p'=>1, 'picture'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); + $noKidEleAr = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); + + // Special parent-child relations. 
+ + $invalidMomKidAr = array('a'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'iframe'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'address'=>array('address'=>1, 'article'=>1, 'aside'=>1, 'footer'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hgroup'=>1, 'keygen'=>1, 'nav'=>1, 'section'=>1), 'audio'=>array('audio'=>1, 'video'=>1), 'button'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'iframe'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'dfn'=>array('dfn'=>1), 'fieldset'=>array('fieldset'=>1), 'footer'=>array('footer'=>1, 'header'=>1), 'form'=>array('form'=>1), 'header'=>array('footer'=>1, 'header'=>1), 'label'=>array('label'=>1), 'main'=>array('main'=>1), 'meter'=>array('meter'=>1), 'noscript'=>array('script'=>1), 'progress'=>array('progress'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1), 'time'=>array('time'=>1), 'video'=>array('audio'=>1, 'video'=>1)); + $invalidKidEleAr = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'button'=>1, 'details'=>1, 'dfn'=>1, 'embed'=>1, 'fieldset'=>1, 'footer'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hgroup'=>1, 'iframe'=>1, 'keygen'=>1, 'label'=>1, 'main'=>1, 'meter'=>1, 'nav'=>1, 'progress'=>1, 'ruby'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'textarea'=>1, 'time'=>1, 'video'=>1); // $invalidMomKidAr values + $invalidMomEleAr = array_keys($invalidMomKidAr); + $validMomKidAr = array('colgroup'=>array('col'=>1, 'template'=>1), 'datalist'=>array('option'=>1, 'script'=>1), 'details'=>array('summary'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'div'=>1, 'dt'=>1), 'hgroup'=>array('h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1), 'menu'=>array('li'=>1, 'script'=>1, 'template'=>1), 'ol'=>array('li'=>1, 'script'=>1, 'template'=>1), 'optgroup'=>array('option'=>1, 'script'=>1, 'template'=>1), 'option'=>array('#pcdata'=>1), 'picture'=>array('img'=>1, 'script'=>1, 
'source'=>1, 'template'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rp'=>1, 'rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, '#pcdata'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'script'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'template'=>1), 'tbody'=>array('script'=>1, 'template'=>1, 'tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('script'=>1, 'template'=>1, 'tr'=>1), 'tr'=>array('script'=>1, 'td'=>1, 'template'=>1, 'th'=>1), 'ul'=>array('li'=>1, 'script'=>1, 'template'=>1)); // Immediate parent-child relation + if ($GLOBALS['C']['direct_list_nest']) { + $validMomKidAr['ol'] = $validMomKidAr['ul'] = $validMomKidAr['menu'] += array('menu'=>1, 'ol'=>1, 'ul'=>1); + } + $otherValidMomKidAr = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'audio'=>array('source'=>1, 'track'=>1), 'blockquote'=>array('script'=>1), 'details'=>array('summary'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'figure'=>array('figcaption'=>1),'form'=>array('script'=>1), 'map'=>array('area'=>1), 'legend'=>array('h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1), 'object'=>array('param'=>1, 'embed'=>1), 'summary'=>array('h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hgroup'=>1), 'video'=>array('source'=>1, 'track'=>1)); + + // Valid elements for top-level parent. + + $mom = ((isset($flowEleAr[$parentEle]) && $parentEle != '#pcdata') + || isset($otherEleAr[$parentEle])) + ? $parentEle + : 'div'; + if (isset($noKidEleAr[$mom])) { + return (!$act ? 
'' : str_replace(array('<', '>'), array('<', '>'), $t)); + } + if (isset($validMomKidAr[$mom])) { + $validInMomEleAr = $validMomKidAr[$mom]; + } elseif (isset($inlineKidEleAr[$mom])) { + $validInMomEleAr = $inlineEleAr; + $inlineKidEleAr['del'] = 1; + $inlineKidEleAr['ins'] = 1; + } elseif (isset($flowKidEleAr[$mom])) { + $validInMomEleAr = $flowEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } elseif (isset($blockKidEleAr[$mom])) { + $validInMomEleAr = $blockEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } + if (isset($otherValidMomKidAr[$mom])) { + $validInMomEleAr = $validInMomEleAr + $otherValidMomKidAr[$mom]; + } + if (isset($invalidMomKidAr[$mom])) { + $validInMomEleAr = array_diff_assoc($validInMomEleAr, $invalidMomKidAr[$mom]); + } + if (strpos($mom, '-')) { // Custom element + $validInMomEleAr = array('*' => 1, '#pcdata' =>1); + } + + // Loop over elements. + + $t = explode('<', $t); + $validKidsOfMom = $openEleQueue = array(); // Queue of opened elements + ob_start(); + for ($i=-1, $eleCount=count($t); ++$i<$eleCount;) { + + // Check element validity as child. Same code as section: Finishing (below). 
+ + if ($queueLength = count($openEleQueue)) { + $eleNow = array_pop($openEleQueue); + $openEleQueue[] = $eleNow; + if (isset($validMomKidAr[$eleNow])) { + $validKidsOfMom = $validMomKidAr[$eleNow]; + } elseif (isset($inlineKidEleAr[$eleNow])) { + $validKidsOfMom = $inlineEleAr; + $inlineKidEleAr['del'] = 1; + $inlineKidEleAr['ins'] = 1; + } elseif (isset($flowKidEleAr[$eleNow])) { + $validKidsOfMom = $flowEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } elseif (isset($blockKidEleAr[$eleNow])) { + $validKidsOfMom = $blockEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } + if (isset($otherValidMomKidAr[$eleNow])) { + $validKidsOfMom = $validKidsOfMom + $otherValidMomKidAr[$eleNow]; + } + if (isset($invalidMomKidAr[$eleNow])) { + $validKidsOfMom = array_diff_assoc($validKidsOfMom, $invalidMomKidAr[$eleNow]); + } + if (strpos($eleNow, '-')) { // Custom element + $validKidsOfMom = array('*'=>1, '#pcdata'=>1); + } + } else { + $validKidsOfMom = $validInMomEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } + if ( + isset($ele) + && ($act == 1 + || (isset($validKidsOfMom['#pcdata']) + && ($act == 3 + || $act == 5))) + ) { + echo '<', $slash, $ele, $attrs, '>'; + } + if (isset($content[0])) { + if (strlen(trim($content)) + && (($queueLength && isset($blockKidEleAr[$eleNow])) + || (isset($blockKidEleAr[$mom]) && !$queueLength)) + ) { + echo '
', $content, '
'; + } elseif ($act < 3 || isset($validKidsOfMom['#pcdata'])) { + echo $content; + } elseif (strpos($content, "\x02\x04")) { + foreach ( + preg_split( + '`(\x01\x02[^\x01\x02]+\x02\x01)`', $content, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $m) { + echo( + substr($m, 0, 2) == "\x01\x02" + ? $m + : ($act > 4 + ? preg_replace('`\S`', '', $m) + : '')); } - echo $add, '<', $e, $a, '>'; - unset($e); + } elseif ($act > 4) { + echo preg_replace('`\S`', '', $content); + } + } // End: Check element validity as child + + // Get parts of element. + + if (!preg_match('`^(/?)([a-z][^ >]*)([^>]*)>(.*)`sm', $t[$i], $m)) { + $content = $t[$i]; + continue; + } + $slash = null; // Closing tag's slash + $ele = null; // Name + $attrs = null; // Attribute string + $content = null; // Content + list($all, $slash, $ele, $attrs, $content) = $m; + + // Handle closing tag. + + if ($slash) { + if (isset($noKidEleAr[$ele]) || !in_array($ele, $openEleQueue)) { // Element empty type or unopened + continue; + } + if ($eleNow == $ele) { // Last open tag + array_pop($openEleQueue); + echo ''; + unset($ele); continue; + } + $closedTags = ''; // Nesting, so close open elements as necessary + for ($j=-1, $cj=count($openEleQueue); ++$j<$cj;) { + if (($closableEle = array_pop($openEleQueue)) == $ele) { + break; + } else { + $closedTags .= ""; + } + } + echo $closedTags, ''; + unset($ele); + continue; } - // end - if ($ql = count($q)) { - $p = array_pop($q); - $q[] = $p; - if (isset($cS[$p])) { - $ok = $cS[$p]; - } elseif (isset($cI[$p])) { - $ok = $eI; - $cI['del'] = 1; - $cI['ins'] = 1; - } elseif (isset($cF[$p])) { - $ok = $eF; - unset($cI['del'], $cI['ins']); - } elseif (isset($cB[$p])) { - $ok = $eB; - unset($cI['del'], $cI['ins']); - } - if (isset($cO[$p])) { - $ok = $ok + $cO[$p]; - } - if (isset($cN[$p])) { - $ok = array_diff_assoc($ok, $cN[$p]); + // Handle opening tag. 
+ + if (isset($blockKidEleAr[$ele]) && strlen(trim($content))) { // $blockKidEleAr element needs $blockEleAr element + $t[$i] = "{$ele}{$attrs}>"; + array_splice($t, $i+1, 0, 'div>'. $content); + unset($ele, $content); + ++$eleCount; + --$i; + continue; + } + if (strpos($ele, '-')) { // Custom element + $validKidsOfMom[$ele] = 1; + } + if ((($queueLength && isset($blockKidEleAr[$eleNow])) + || (isset($blockKidEleAr[$mom]) && !$queueLength)) + && !isset($blockEleAr[$ele]) + && !isset($validKidsOfMom[$ele]) + && !isset($validKidsOfMom['*']) + ) { + array_splice($t, $i, 0, 'div>'); + unset($ele, $content); + ++$eleCount; + --$i; + continue; + } + if ( + !$queueLength + || !isset($invalidKidEleAr[$ele]) + || !array_intersect($openEleQueue, $invalidMomEleAr) + ) { // If no open element; mostly immediate parent-child relation should hold + if (!isset($validKidsOfMom[$ele]) && !isset($validKidsOfMom['*'])) { + if ($queueLength && isset($closingTagOmitableEleAr[$eleNow])) { + echo ''; + unset($ele, $content); + --$i; } - if (strpos($p, '-')) { - $ok = ['*' => 1, '#pcdata' => 1]; + continue; + } + if (!isset($noKidEleAr[$ele])) { + $openEleQueue[] = $ele; + } + echo '<', $ele, $attrs, '>'; + unset($ele); + continue; + } + if (isset($validMomKidAr[$eleNow][$ele])) { // Specific parent-child relation + if (!isset($noKidEleAr[$ele])) { + $openEleQueue[] = $ele; + } + echo '<', $ele, $attrs, '>'; + unset($ele); + continue; + } + $closedTags = ''; // Nesting, so close open elements as needed + $openEleQueue2 = array(); + for ($k=-1, $kc=count($openEleQueue); ++$k<$kc;) { + $closableEle = $openEleQueue[$k]; + $validKids2 = array(); + if (isset($validMomKidAr[$closableEle])) { + $openEleQueue2[] = $closableEle; + continue; + } + $validKids2 = isset($inlineKidEleAr[$closableEle]) ? 
$inlineEleAr : $flowEleAr; + if (isset($otherValidMomKidAr[$closableEle])) { + $validKids2 = $validKids2 + $otherValidMomKidAr[$closableEle]; + } + if (isset($invalidMomKidAr[$closableEle])) { + $validKids2 = array_diff_assoc($validKids2, $invalidMomKidAr[$closableEle]); + } + if (!isset($validKids2[$ele]) && !strpos($ele, '-')) { + if (!$k && !isset($validInMomEleAr[$ele]) && !isset($validInMomEleAr['*'])) { + continue 2; } - } else { - $ok = $inOk; - unset($cI['del'], $cI['ins']); - } - if (isset($e) && (1 === $do || (isset($ok['#pcdata']) && (3 === $do || 5 === $do)))) { - echo '<', $s, $e, $a, '>'; - } - if (isset($x[0])) { - if (strlen(trim($x)) && (($ql && isset($cB[$p])) || (isset($cB[$in]) && !$ql))) { - echo '
', $x, '
'; - } elseif ($do < 3 || isset($ok['#pcdata'])) { - echo $x; - } elseif (strpos($x, "\x02\x04")) { - foreach (preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY) as $v) { - echo "\x01\x02" === substr($v, 0, 2) ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : ''); - } - } elseif ($do > 4) { - echo preg_replace('`\S`', '', $x); + $closedTags = ""; + for (;++$k<$kc;) { + $closedTags = "{$closedTags}"; } - } - while (!empty($q) && ($e = array_pop($q))) { - echo ''; - } - $o = ob_get_contents(); - ob_end_clean(); + break; + } else { + $openEleQueue2[] = $closableEle; + } + } + $openEleQueue = $openEleQueue2; + if (!isset($noKidEleAr[$ele])) { + $openEleQueue[] = $ele; + } + echo $closedTags, '<', $ele, $attrs, '>'; + unset($ele); + continue; + } // End of For: loop over elements - return $o; -} + // Finishing. Same code as: 'Check element validity as child'. -function hl_cmtcd($t) -{ - // comment/CDATA sec handler - $t = $t[0]; - global $C; - if (!($v = $C[$n = '-' === $t[3] ? 'comment' : 'cdata'])) { - return $t; - } - if (1 === $v) { - return ''; - } - if ('comment' === $n && $v < 4) { - if (' ' !== substr(($t = preg_replace('`--+`', '-', substr($t, 4, -3))), -1)) { - $t .= ' '; - } - } else { - $t = substr($t, 1, -1); - } - $t = 2 === $v ? 
str_replace(['&', '<', '>'], ['&', '<', '>'], $t) : $t; + if ($queueLength = count($openEleQueue)) { + $eleNow = array_pop($openEleQueue); + $openEleQueue[] = $eleNow; + if (isset($validMomKidAr[$eleNow])) { + $validKidsOfMom = $validMomKidAr[$eleNow]; + } elseif (isset($inlineKidEleAr[$eleNow])) { + $validKidsOfMom = $inlineEleAr; + $inlineKidEleAr['del'] = 1; + $inlineKidEleAr['ins'] = 1; + } elseif (isset($flowKidEleAr[$eleNow])) { + $validKidsOfMom = $flowEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } elseif (isset($blockKidEleAr[$eleNow])) { + $validKidsOfMom = $blockEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } + if (isset($otherValidMomKidAr[$eleNow])) { + $validKidsOfMom = $validKidsOfMom + $otherValidMomKidAr[$eleNow]; + } + if (isset($invalidMomKidAr[$eleNow])) { + $validKidsOfMom = array_diff_assoc($validKidsOfMom, $invalidMomKidAr[$eleNow]); + } + if (strpos($eleNow, '-')) { // Custom element + $validKidsOfMom = array('*'=>1, '#pcdata'=>1); + } + } else { + $validKidsOfMom = $validInMomEleAr; + unset($inlineKidEleAr['del'], $inlineKidEleAr['ins']); + } + if ( + isset($ele) + && ($act == 1 + || (isset($validKidsOfMom['#pcdata']) + && ($act == 3 + || $act == 5))) + ) { + echo '<', $slash, $ele, $attrs, '>'; + } + if (isset($content[0])) { + if ( + strlen(trim($content)) + && (($queueLength && isset($blockKidEleAr[$eleNow])) + || (isset($blockKidEleAr[$mom]) && !$queueLength)) + ) { + echo '
', $content, '
'; + } elseif ($act < 3 || isset($validKidsOfMom['#pcdata'])) { + echo $content; + } elseif (strpos($content, "\x02\x04")) { + foreach ( + preg_split( + '`(\x01\x02[^\x01\x02]+\x02\x01)`', $content, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $m) { + echo( + substr($m, 0, 2) == "\x01\x02" + ? $m + : ($act > 4 + ? preg_replace('`\S`', '', $m) + : '')); + } + } elseif ($act > 4) { + echo preg_replace('`\S`', '', $content); + } + } // End: Finishing - return str_replace(['&', '<', '>'], ["\x03", "\x04", "\x05"], ('comment' === $n ? "\x01\x02\x04!--$t--\x05\x02\x01" : "\x01\x01\x04$t\x05\x01\x01")); + while (!empty($openEleQueue) && ($ele = array_pop($openEleQueue))) { + echo ''; + } + $o = ob_get_contents(); + ob_end_clean(); + return $o; } -function hl_ent($t) +/** + * Handle comment/CDATA section. + * + * Filter/sanitize as per $config and disguise special characters. + * + * @param array $t Array result of preg_replace, with potential comment/CDATA. + * @return string Sanitized comment/CDATA with hidden special characters. 
+ */ +function hl_commentCdata($t) { - // entitity handler - global $C; - $t = $t[1]; - static $U = ['quot' => 1, 'amp' => 1, 'lt' => 1, 'gt' => 1]; - static $N = ['fnof' => '402', 'Alpha' => '913', 'Beta' => '914', 'Gamma' => '915', 'Delta' => '916', 'Epsilon' => '917', 'Zeta' => '918', 'Eta' => '919', 'Theta' => '920', 'Iota' => '921', 'Kappa' => '922', 'Lambda' => '923', 'Mu' => '924', 'Nu' => '925', 'Xi' => '926', 'Omicron' => '927', 'Pi' => '928', 'Rho' => '929', 'Sigma' => '931', 'Tau' => '932', 'Upsilon' => '933', 'Phi' => '934', 'Chi' => '935', 'Psi' => '936', 'Omega' => '937', 'alpha' => '945', 'beta' => '946', 'gamma' => '947', 'delta' => '948', 'epsilon' => '949', 'zeta' => '950', 'eta' => '951', 'theta' => '952', 'iota' => '953', 'kappa' => '954', 'lambda' => '955', 'mu' => '956', 'nu' => '957', 'xi' => '958', 'omicron' => '959', 'pi' => '960', 'rho' => '961', 'sigmaf' => '962', 'sigma' => '963', 'tau' => '964', 'upsilon' => '965', 'phi' => '966', 'chi' => '967', 'psi' => '968', 'omega' => '969', 'thetasym' => '977', 'upsih' => '978', 'piv' => '982', 'bull' => '8226', 'hellip' => '8230', 'prime' => '8242', 'Prime' => '8243', 'oline' => '8254', 'frasl' => '8260', 'weierp' => '8472', 'image' => '8465', 'real' => '8476', 'trade' => '8482', 'alefsym' => '8501', 'larr' => '8592', 'uarr' => '8593', 'rarr' => '8594', 'darr' => '8595', 'harr' => '8596', 'crarr' => '8629', 'lArr' => '8656', 'uArr' => '8657', 'rArr' => '8658', 'dArr' => '8659', 'hArr' => '8660', 'forall' => '8704', 'part' => '8706', 'exist' => '8707', 'empty' => '8709', 'nabla' => '8711', 'isin' => '8712', 'notin' => '8713', 'ni' => '8715', 'prod' => '8719', 'sum' => '8721', 'minus' => '8722', 'lowast' => '8727', 'radic' => '8730', 'prop' => '8733', 'infin' => '8734', 'ang' => '8736', 'and' => '8743', 'or' => '8744', 'cap' => '8745', 'cup' => '8746', 'int' => '8747', 'there4' => '8756', 'sim' => '8764', 'cong' => '8773', 'asymp' => '8776', 'ne' => '8800', 'equiv' => '8801', 'le' => '8804', 'ge' 
=> '8805', 'sub' => '8834', 'sup' => '8835', 'nsub' => '8836', 'sube' => '8838', 'supe' => '8839', 'oplus' => '8853', 'otimes' => '8855', 'perp' => '8869', 'sdot' => '8901', 'lceil' => '8968', 'rceil' => '8969', 'lfloor' => '8970', 'rfloor' => '8971', 'lang' => '9001', 'rang' => '9002', 'loz' => '9674', 'spades' => '9824', 'clubs' => '9827', 'hearts' => '9829', 'diams' => '9830', 'apos' => '39', 'OElig' => '338', 'oelig' => '339', 'Scaron' => '352', 'scaron' => '353', 'Yuml' => '376', 'circ' => '710', 'tilde' => '732', 'ensp' => '8194', 'emsp' => '8195', 'thinsp' => '8201', 'zwnj' => '8204', 'zwj' => '8205', 'lrm' => '8206', 'rlm' => '8207', 'ndash' => '8211', 'mdash' => '8212', 'lsquo' => '8216', 'rsquo' => '8217', 'sbquo' => '8218', 'ldquo' => '8220', 'rdquo' => '8221', 'bdquo' => '8222', 'dagger' => '8224', 'Dagger' => '8225', 'permil' => '8240', 'lsaquo' => '8249', 'rsaquo' => '8250', 'euro' => '8364', 'nbsp' => '160', 'iexcl' => '161', 'cent' => '162', 'pound' => '163', 'curren' => '164', 'yen' => '165', 'brvbar' => '166', 'sect' => '167', 'uml' => '168', 'copy' => '169', 'ordf' => '170', 'laquo' => '171', 'not' => '172', 'shy' => '173', 'reg' => '174', 'macr' => '175', 'deg' => '176', 'plusmn' => '177', 'sup2' => '178', 'sup3' => '179', 'acute' => '180', 'micro' => '181', 'para' => '182', 'middot' => '183', 'cedil' => '184', 'sup1' => '185', 'ordm' => '186', 'raquo' => '187', 'frac14' => '188', 'frac12' => '189', 'frac34' => '190', 'iquest' => '191', 'Agrave' => '192', 'Aacute' => '193', 'Acirc' => '194', 'Atilde' => '195', 'Auml' => '196', 'Aring' => '197', 'AElig' => '198', 'Ccedil' => '199', 'Egrave' => '200', 'Eacute' => '201', 'Ecirc' => '202', 'Euml' => '203', 'Igrave' => '204', 'Iacute' => '205', 'Icirc' => '206', 'Iuml' => '207', 'ETH' => '208', 'Ntilde' => '209', 'Ograve' => '210', 'Oacute' => '211', 'Ocirc' => '212', 'Otilde' => '213', 'Ouml' => '214', 'times' => '215', 'Oslash' => '216', 'Ugrave' => '217', 'Uacute' => '218', 'Ucirc' => '219', 
'Uuml' => '220', 'Yacute' => '221', 'THORN' => '222', 'szlig' => '223', 'agrave' => '224', 'aacute' => '225', 'acirc' => '226', 'atilde' => '227', 'auml' => '228', 'aring' => '229', 'aelig' => '230', 'ccedil' => '231', 'egrave' => '232', 'eacute' => '233', 'ecirc' => '234', 'euml' => '235', 'igrave' => '236', 'iacute' => '237', 'icirc' => '238', 'iuml' => '239', 'eth' => '240', 'ntilde' => '241', 'ograve' => '242', 'oacute' => '243', 'ocirc' => '244', 'otilde' => '245', 'ouml' => '246', 'divide' => '247', 'oslash' => '248', 'ugrave' => '249', 'uacute' => '250', 'ucirc' => '251', 'uuml' => '252', 'yacute' => '253', 'thorn' => '254', 'yuml' => '255']; - if ('#' !== $t[0]) { - return ($C['and_mark'] ? "\x06" : '&') . (isset($U[$t]) ? $t : (isset($N[$t]) ? (!$C['named_entity'] ? '#' . ($C['hexdec_entity'] > 1 ? 'x' . dechex($N[$t]) : $N[$t]) : $t) : 'amp;' . $t)) . ';'; - } - if (($n = ctype_digit($t = substr($t, 1)) ? (int) $t : hexdec(substr($t, 1))) < 9 || ($n > 13 && $n < 32) || 11 === $n || 12 === $n || ($n > 126 && $n < 160 && 133 !== $n) || ($n > 55295 && ($n < 57344 || ($n > 64975 && $n < 64992) || 65534 === $n || 65535 === $n || $n > 1114111))) { - return ($C['and_mark'] ? "\x06" : '&') . "amp;#{$t};"; - } - - return ($C['and_mark'] ? "\x06" : '&') . '#' . (((ctype_digit($t) && $C['hexdec_entity'] < 2) || !$C['hexdec_entity']) ? $n : 'x' . dechex($n)) . ';'; + $t = $t[0]; + global $C; + if (!($rule = $C[$type = $t[3] == '-' ? 'comment' : 'cdata'])) { + return $t; + } + if ($rule == 1) { + return ''; + } + if ($type == 'comment') { + if (substr(($t = preg_replace('`--+`', '-', substr($t, 4, -3))), -1) != ' ') { + $t .= $rule == 4 ? '' : ' '; + } + } else { + $t = substr($t, 1, -1); + } + $t = $rule == 2 ? str_replace(array('&', '<', '>'), array('&', '<', '>'), $t) : $t; + return + str_replace( + array('&', '<', '>'), + array("\x03", "\x04", "\x05"), + ($type == 'comment' ? 
"\x01\x02\x04!--$t--\x05\x02\x01" : "\x01\x01\x04$t\x05\x01\x01")); } -function hl_prot($p, $c = null) +/** + * Transform deprecated element, with any attribute, into a new element. + * + * + * @param string $ele Deprecated element. + * @param string $attrStr Attribute string of element. + * @param int $act No transformation if 2. + * @return mixed New attribute string (may be empty) or 0. + */ +function hl_deprecatedElement(&$ele, &$attrStr, $act=1) { - // check URL scheme - global $C; - $b = $a = ''; - if (null === $c) { - $c = 'style'; - $b = $p[1]; - $a = $p[3]; - $p = trim($p[2]); - } - $c = isset($C['schemes'][$c]) ? $C['schemes'][$c] : $C['schemes']['*']; - static $d = 'denied:'; - if (isset($c['!']) && substr($p, 0, 7) !== $d) { - $p = "$d$p"; - } - if (isset($c['*']) || !strcspn($p, '#?;') || (substr($p, 0, 7) === $d)) { - return "{$b}{$p}{$a}"; - } // All ok, frag, query, param - if (preg_match('`^([^:?[@!$()*,=/\'\]]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])) { // Denied prot - return "{$b}{$d}{$p}{$a}"; - } - if ($C['abs_url']) { - if (-1 === $C['abs_url'] && 0 === strpos($p, $C['base_url'])) { // Make url rel - $p = substr($p, strlen($C['base_url'])); - } elseif (empty($m[1])) { // Make URL abs - if ('//' === substr($p, 0, 2)) { - $p = substr($C['base_url'], 0, strpos($C['base_url'], ':') + 1) . $p; - } elseif ('/' === $p[0]) { - $p = preg_replace('`(^.+?://[^/]+)(.*)`', '$1', $C['base_url']) . $p; - } elseif (strcspn($p, './')) { - $p = $C['base_url'] . $p; - } else { - preg_match('`^([a-zA-Z\d\-+.]+://[^/]+)(.*)`', $C['base_url'], $m); - $p = preg_replace('`(?<=/)\./`', '', $m[2] . $p); - while (preg_match('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', $p)) { - $p = preg_replace('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', '', $p); - } - $p = $m[1] . 
$p; - } - } - } + if ($ele == 'big') { + $ele = 'span'; + return 'font-size: larger;'; + } + if ($ele == 's' || $ele == 'strike') { + $ele = 'span'; + return 'text-decoration: line-through;'; + } + if ($ele == 'tt') { + $ele = 'code'; + return ''; + } + if ($ele == 'center') { + $ele = 'div'; + return 'text-align: center;'; + } + static $fontSizeAr = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%'); + if ($ele == 'font') { + $attrStrNew = ''; + while (preg_match('`(^|\s)(color|size)\s*=\s*(\'|")?(.+?)(\\3|\s|$)`i', $attrStr, $m)) { + $attrStr = str_replace($m[0], ' ', $attrStr) ; + $attrStrNew .= + strtolower($m[2]) == 'color' + ? ' color: '. str_replace(array('"', ';', ':'), '\'', trim($m[4])). ';' + : (isset($fontSizeAr[($m = trim($m[4]))]) + ? ' font-size: '. $fontSizeAr[$m]. ';' + : ''); + } + while ( + preg_match('`(^|\s)face\s*=\s*(\'|")?([^=]+?)\\2`i', $attrStr, $m) + || preg_match('`(^|\s)face\s*=(\s*)(\S+)`i', $attrStr, $m) + ) { + $attrStr = str_replace($m[0], ' ', $attrStr) ; + $attrStrNew .= ' font-family: '. str_replace(array('"', ';', ':'), '\'', trim($m[3])). ';'; + } + $ele = 'span'; + return ltrim(str_replace('<', '', $attrStrNew)); + } + if ($ele == 'acronym') { + $ele = 'abbr'; + return ''; + } + if ($ele == 'dir') { + $ele = 'ul'; + return ''; + } + if ($act == 2) { + $ele = 0; + return 0; + } + return ''; +} - return "{$b}{$p}{$a}"; +/** + * Handle entity. + * + * As needed, convert to named/hexadecimal form, or neutralize '&' as '&'. + * + * @param array $t Array result of preg_replace, with potential entity. + * @return string Neutralized or converted entity. 
+ */ +function hl_entity($t) +{ + global $C; + $t = $t[1]; + static $reservedEntAr = array('amp'=>1, 'AMP'=>1, 'gt'=>1, 'GT'=>1, 'lt'=>1, 'LT'=>1, 'quot'=>1, 'QUOT'=>1); + static $commonEntNameAr = array('Aacute'=>'193', 'aacute'=>'225', 'Acirc'=>'194', 'acirc'=>'226', 'acute'=>'180', 'AElig'=>'198', 'aelig'=>'230', 'Agrave'=>'192', 'agrave'=>'224', 'alefsym'=>'8501', 'Alpha'=>'913', 'alpha'=>'945', 'and'=>'8743', 'ang'=>'8736', 'apos'=>'39', 'Aring'=>'197', 'aring'=>'229', 'asymp'=>'8776', 'Atilde'=>'195', 'atilde'=>'227', 'Auml'=>'196', 'auml'=>'228', 'bdquo'=>'8222', 'Beta'=>'914', 'beta'=>'946', 'brvbar'=>'166', 'bull'=>'8226', 'cap'=>'8745', 'Ccedil'=>'199', 'ccedil'=>'231', 'cedil'=>'184', 'cent'=>'162', 'Chi'=>'935', 'chi'=>'967', 'circ'=>'710', 'clubs'=>'9827', 'cong'=>'8773', 'copy'=>'169', 'crarr'=>'8629', 'cup'=>'8746', 'curren'=>'164', 'dagger'=>'8224', 'Dagger'=>'8225', 'darr'=>'8595', 'dArr'=>'8659', 'deg'=>'176', 'Delta'=>'916', 'delta'=>'948', 'diams'=>'9830', 'divide'=>'247', 'Eacute'=>'201', 'eacute'=>'233', 'Ecirc'=>'202', 'ecirc'=>'234', 'Egrave'=>'200', 'egrave'=>'232', 'empty'=>'8709', 'emsp'=>'8195', 'ensp'=>'8194', 'Epsilon'=>'917', 'epsilon'=>'949', 'equiv'=>'8801', 'Eta'=>'919', 'eta'=>'951', 'ETH'=>'208', 'eth'=>'240', 'Euml'=>'203', 'euml'=>'235', 'euro'=>'8364', 'exist'=>'8707', 'fnof'=>'402', 'forall'=>'8704', 'frac12'=>'189', 'frac14'=>'188', 'frac34'=>'190', 'frasl'=>'8260', 'Gamma'=>'915', 'gamma'=>'947', 'ge'=>'8805', 'harr'=>'8596', 'hArr'=>'8660', 'hearts'=>'9829', 'hellip'=>'8230', 'Iacute'=>'205', 'iacute'=>'237', 'Icirc'=>'206', 'icirc'=>'238', 'iexcl'=>'161', 'Igrave'=>'204', 'igrave'=>'236', 'image'=>'8465', 'infin'=>'8734', 'int'=>'8747', 'Iota'=>'921', 'iota'=>'953', 'iquest'=>'191', 'isin'=>'8712', 'Iuml'=>'207', 'iuml'=>'239', 'Kappa'=>'922', 'kappa'=>'954', 'Lambda'=>'923', 'lambda'=>'955', 'laquo'=>'171', 'larr'=>'8592', 'lArr'=>'8656', 'lceil'=>'8968', 'ldquo'=>'8220', 'le'=>'8804', 'lfloor'=>'8970', 'lowast'=>'8727', 
'loz'=>'9674', 'lrm'=>'8206', 'lsaquo'=>'8249', 'lsquo'=>'8216', 'macr'=>'175', 'mdash'=>'8212', 'micro'=>'181', 'middot'=>'183', 'minus'=>'8722', 'Mu'=>'924', 'mu'=>'956', 'nabla'=>'8711', 'nbsp'=>'160', 'ndash'=>'8211', 'ne'=>'8800', 'ni'=>'8715', 'not'=>'172', 'notin'=>'8713', 'nsub'=>'8836', 'Ntilde'=>'209', 'ntilde'=>'241', 'Nu'=>'925', 'nu'=>'957', 'Oacute'=>'211', 'oacute'=>'243', 'Ocirc'=>'212', 'ocirc'=>'244', 'OElig'=>'338', 'oelig'=>'339', 'Ograve'=>'210', 'ograve'=>'242', 'oline'=>'8254', 'Omega'=>'937', 'omega'=>'969', 'Omicron'=>'927', 'omicron'=>'959', 'oplus'=>'8853', 'or'=>'8744', 'ordf'=>'170', 'ordm'=>'186', 'Oslash'=>'216', 'oslash'=>'248', 'Otilde'=>'213', 'otilde'=>'245', 'otimes'=>'8855', 'Ouml'=>'214', 'ouml'=>'246', 'para'=>'182', 'part'=>'8706', 'permil'=>'8240', 'perp'=>'8869', 'Phi'=>'934', 'phi'=>'966', 'Pi'=>'928', 'pi'=>'960', 'piv'=>'982', 'plusmn'=>'177', 'pound'=>'163', 'prime'=>'8242', 'Prime'=>'8243', 'prod'=>'8719', 'prop'=>'8733', 'Psi'=>'936', 'psi'=>'968', 'radic'=>'8730', 'raquo'=>'187', 'rarr'=>'8594', 'rArr'=>'8658', 'rceil'=>'8969', 'rdquo'=>'8221', 'real'=>'8476', 'reg'=>'174', 'rfloor'=>'8971', 'Rho'=>'929', 'rho'=>'961', 'rlm'=>'8207', 'rsaquo'=>'8250', 'rsquo'=>'8217', 'sbquo'=>'8218', 'Scaron'=>'352', 'scaron'=>'353', 'sdot'=>'8901', 'sect'=>'167', 'shy'=>'173', 'Sigma'=>'931', 'sigma'=>'963', 'sigmaf'=>'962', 'sim'=>'8764', 'spades'=>'9824', 'sub'=>'8834', 'sube'=>'8838', 'sum'=>'8721', 'sup'=>'8835', 'sup1'=>'185', 'sup2'=>'178', 'sup3'=>'179', 'supe'=>'8839', 'szlig'=>'223', 'Tau'=>'932', 'tau'=>'964', 'there4'=>'8756', 'Theta'=>'920', 'theta'=>'952', 'thetasym'=>'977', 'thinsp'=>'8201', 'THORN'=>'222', 'thorn'=>'254', 'tilde'=>'732', 'times'=>'215', 'trade'=>'8482', 'Uacute'=>'218', 'uacute'=>'250', 'uarr'=>'8593', 'uArr'=>'8657', 'Ucirc'=>'219', 'ucirc'=>'251', 'Ugrave'=>'217', 'ugrave'=>'249', 'uml'=>'168', 'upsih'=>'978', 'Upsilon'=>'933', 'upsilon'=>'965', 'Uuml'=>'220', 'uuml'=>'252', 'weierp'=>'8472', 
'Xi'=>'926', 'xi'=>'958', 'Yacute'=>'221', 'yacute'=>'253', 'yen'=>'165', 'yuml'=>'255', 'Yuml'=>'376', 'Zeta'=>'918', 'zeta'=>'950', 'zwj'=>'8205', 'zwnj'=>'8204'); + static $rareEntNameAr = array('Abreve'=>'258', 'abreve'=>'259', 'ac'=>'8766', 'acd'=>'8767', 'Acy'=>'1040', 'acy'=>'1072', 'af'=>'8289', 'Afr'=>'120068', 'afr'=>'120094', 'aleph'=>'8501', 'Amacr'=>'256', 'amacr'=>'257', 'amalg'=>'10815', 'And'=>'10835', 'andand'=>'10837', 'andd'=>'10844', 'andslope'=>'10840', 'andv'=>'10842', 'ange'=>'10660', 'angle'=>'8736', 'angmsd'=>'8737', 'angmsdaa'=>'10664', 'angmsdab'=>'10665', 'angmsdac'=>'10666', 'angmsdad'=>'10667', 'angmsdae'=>'10668', 'angmsdaf'=>'10669', 'angmsdag'=>'10670', 'angmsdah'=>'10671', 'angrt'=>'8735', 'angrtvb'=>'8894', 'angrtvbd'=>'10653', 'angsph'=>'8738', 'angst'=>'197', 'angzarr'=>'9084', 'Aogon'=>'260', 'aogon'=>'261', 'Aopf'=>'120120', 'aopf'=>'120146', 'ap'=>'8776', 'apacir'=>'10863', 'apE'=>'10864', 'ape'=>'8778', 'apid'=>'8779', 'ApplyFunction'=>'8289', 'approx'=>'8776', 'approxeq'=>'8778', 'Ascr'=>'119964', 'ascr'=>'119990', 'Assign'=>'8788', 'ast'=>'42', 'asympeq'=>'8781', 'awconint'=>'8755', 'awint'=>'10769', 'backcong'=>'8780', 'backepsilon'=>'1014', 'backprime'=>'8245', 'backsim'=>'8765', 'backsimeq'=>'8909', 'Backslash'=>'8726', 'Barv'=>'10983', 'barvee'=>'8893', 'barwed'=>'8965', 'Barwed'=>'8966', 'barwedge'=>'8965', 'bbrk'=>'9141', 'bbrktbrk'=>'9142', 'bcong'=>'8780', 'Bcy'=>'1041', 'bcy'=>'1073', 'becaus'=>'8757', 'because'=>'8757', 'Because'=>'8757', 'bemptyv'=>'10672', 'bepsi'=>'1014', 'bernou'=>'8492', 'Bernoullis'=>'8492', 'beth'=>'8502', 'between'=>'8812', 'Bfr'=>'120069', 'bfr'=>'120095', 'bigcap'=>'8898', 'bigcirc'=>'9711', 'bigcup'=>'8899', 'bigodot'=>'10752', 'bigoplus'=>'10753', 'bigotimes'=>'10754', 'bigsqcup'=>'10758', 'bigstar'=>'9733', 'bigtriangledown'=>'9661', 'bigtriangleup'=>'9651', 'biguplus'=>'10756', 'bigvee'=>'8897', 'bigwedge'=>'8896', 'bkarow'=>'10509', 'blacklozenge'=>'10731', 'blacksquare'=>'9642', 
'blacktriangle'=>'9652', 'blacktriangledown'=>'9662', 'blacktriangleleft'=>'9666', 'blacktriangleright'=>'9656', 'blank'=>'9251', 'blk12'=>'9618', 'blk14'=>'9617', 'blk34'=>'9619', 'block'=>'9608', 'bNot'=>'10989', 'bnot'=>'8976', 'Bopf'=>'120121', 'bopf'=>'120147', 'bot'=>'8869', 'bottom'=>'8869', 'bowtie'=>'8904', 'boxbox'=>'10697', 'boxdl'=>'9488', 'boxdL'=>'9557', 'boxDl'=>'9558', 'boxDL'=>'9559', 'boxdr'=>'9484', 'boxdR'=>'9554', 'boxDr'=>'9555', 'boxDR'=>'9556', 'boxh'=>'9472', 'boxH'=>'9552', 'boxhd'=>'9516', 'boxHd'=>'9572', 'boxhD'=>'9573', 'boxHD'=>'9574', 'boxhu'=>'9524', 'boxHu'=>'9575', 'boxhU'=>'9576', 'boxHU'=>'9577', 'boxminus'=>'8863', 'boxplus'=>'8862', 'boxtimes'=>'8864', 'boxul'=>'9496', 'boxuL'=>'9563', 'boxUl'=>'9564', 'boxUL'=>'9565', 'boxur'=>'9492', 'boxuR'=>'9560', 'boxUr'=>'9561', 'boxUR'=>'9562', 'boxv'=>'9474', 'boxV'=>'9553', 'boxvh'=>'9532', 'boxvH'=>'9578', 'boxVh'=>'9579', 'boxVH'=>'9580', 'boxvl'=>'9508', 'boxvL'=>'9569', 'boxVl'=>'9570', 'boxVL'=>'9571', 'boxvr'=>'9500', 'boxvR'=>'9566', 'boxVr'=>'9567', 'boxVR'=>'9568', 'bprime'=>'8245', 'breve'=>'728', 'Breve'=>'728', 'bscr'=>'119991', 'Bscr'=>'8492', 'bsemi'=>'8271', 'bsim'=>'8765', 'bsime'=>'8909', 'bsol'=>'92', 'bsolb'=>'10693', 'bsolhsub'=>'10184', 'bullet'=>'8226', 'bump'=>'8782', 'bumpE'=>'10926', 'bumpe'=>'8783', 'Bumpeq'=>'8782', 'bumpeq'=>'8783', 'Cacute'=>'262', 'cacute'=>'263', 'Cap'=>'8914', 'capand'=>'10820', 'capbrcup'=>'10825', 'capcap'=>'10827', 'capcup'=>'10823', 'capdot'=>'10816', 'CapitalDifferentialD'=>'8517', 'caret'=>'8257', 'caron'=>'711', 'Cayleys'=>'8493', 'ccaps'=>'10829', 'Ccaron'=>'268', 'ccaron'=>'269', 'Ccirc'=>'264', 'ccirc'=>'265', 'Cconint'=>'8752', 'ccups'=>'10828', 'ccupssm'=>'10832', 'Cdot'=>'266', 'cdot'=>'267', 'Cedilla'=>'184', 'cemptyv'=>'10674', 'centerdot'=>'183', 'CenterDot'=>'183', 'cfr'=>'120096', 'Cfr'=>'8493', 'CHcy'=>'1063', 'chcy'=>'1095', 'check'=>'10003', 'checkmark'=>'10003', 'cir'=>'9675', 'circeq'=>'8791', 
'circlearrowleft'=>'8634', 'circlearrowright'=>'8635', 'circledast'=>'8859', 'circledcirc'=>'8858', 'circleddash'=>'8861', 'CircleDot'=>'8857', 'circledR'=>'174', 'circledS'=>'9416', 'CircleMinus'=>'8854', 'CirclePlus'=>'8853', 'CircleTimes'=>'8855', 'cirE'=>'10691', 'cire'=>'8791', 'cirfnint'=>'10768', 'cirmid'=>'10991', 'cirscir'=>'10690', 'ClockwiseContourIntegral'=>'8754', 'CloseCurlyDoubleQuote'=>'8221', 'CloseCurlyQuote'=>'8217', 'clubsuit'=>'9827', 'colon'=>'58', 'Colon'=>'8759', 'Colone'=>'10868', 'colone'=>'8788', 'coloneq'=>'8788', 'comma'=>'44', 'commat'=>'64', 'comp'=>'8705', 'compfn'=>'8728', 'complement'=>'8705', 'complexes'=>'8450', 'congdot'=>'10861', 'Congruent'=>'8801', 'conint'=>'8750', 'Conint'=>'8751', 'ContourIntegral'=>'8750', 'copf'=>'120148', 'Copf'=>'8450', 'coprod'=>'8720', 'Coproduct'=>'8720', 'COPY'=>'169', 'copysr'=>'8471', 'CounterClockwiseContourIntegral'=>'8755', 'cross'=>'10007', 'Cross'=>'10799', 'Cscr'=>'119966', 'cscr'=>'119992', 'csub'=>'10959', 'csube'=>'10961', 'csup'=>'10960', 'csupe'=>'10962', 'ctdot'=>'8943', 'cudarrl'=>'10552', 'cudarrr'=>'10549', 'cuepr'=>'8926', 'cuesc'=>'8927', 'cularr'=>'8630', 'cularrp'=>'10557', 'Cup'=>'8915', 'cupbrcap'=>'10824', 'cupcap'=>'10822', 'CupCap'=>'8781', 'cupcup'=>'10826', 'cupdot'=>'8845', 'cupor'=>'10821', 'curarr'=>'8631', 'curarrm'=>'10556', 'curlyeqprec'=>'8926', 'curlyeqsucc'=>'8927', 'curlyvee'=>'8910', 'curlywedge'=>'8911', 'curvearrowleft'=>'8630', 'curvearrowright'=>'8631', 'cuvee'=>'8910', 'cuwed'=>'8911', 'cwconint'=>'8754', 'cwint'=>'8753', 'cylcty'=>'9005', 'daleth'=>'8504', 'Darr'=>'8609', 'dash'=>'8208', 'Dashv'=>'10980', 'dashv'=>'8867', 'dbkarow'=>'10511', 'dblac'=>'733', 'Dcaron'=>'270', 'dcaron'=>'271', 'Dcy'=>'1044', 'dcy'=>'1076', 'DD'=>'8517', 'dd'=>'8518', 'ddagger'=>'8225', 'ddarr'=>'8650', 'DDotrahd'=>'10513', 'ddotseq'=>'10871', 'Del'=>'8711', 'demptyv'=>'10673', 'dfisht'=>'10623', 'Dfr'=>'120071', 'dfr'=>'120097', 'dHar'=>'10597', 'dharl'=>'8643', 
'dharr'=>'8642', 'DiacriticalAcute'=>'180', 'DiacriticalDot'=>'729', 'DiacriticalDoubleAcute'=>'733', 'DiacriticalGrave'=>'96', 'DiacriticalTilde'=>'732', 'diam'=>'8900', 'diamond'=>'8900', 'Diamond'=>'8900', 'diamondsuit'=>'9830', 'die'=>'168', 'DifferentialD'=>'8518', 'digamma'=>'989', 'disin'=>'8946', 'div'=>'247', 'divideontimes'=>'8903', 'divonx'=>'8903', 'DJcy'=>'1026', 'djcy'=>'1106', 'dlcorn'=>'8990', 'dlcrop'=>'8973', 'dollar'=>'36', 'Dopf'=>'120123', 'dopf'=>'120149', 'Dot'=>'168', 'dot'=>'729', 'DotDot'=>'8412', 'doteq'=>'8784', 'doteqdot'=>'8785', 'DotEqual'=>'8784', 'dotminus'=>'8760', 'dotplus'=>'8724', 'dotsquare'=>'8865', 'doublebarwedge'=>'8966', 'DoubleContourIntegral'=>'8751', 'DoubleDot'=>'168', 'DoubleDownArrow'=>'8659', 'DoubleLeftArrow'=>'8656', 'DoubleLeftRightArrow'=>'8660', 'DoubleLeftTee'=>'10980', 'DoubleLongLeftArrow'=>'10232', 'DoubleLongLeftRightArrow'=>'10234', 'DoubleLongRightArrow'=>'10233', 'DoubleRightArrow'=>'8658', 'DoubleRightTee'=>'8872', 'DoubleUpArrow'=>'8657', 'DoubleUpDownArrow'=>'8661', 'DoubleVerticalBar'=>'8741', 'downarrow'=>'8595', 'DownArrow'=>'8595', 'Downarrow'=>'8659', 'DownArrowBar'=>'10515', 'DownArrowUpArrow'=>'8693', 'DownBreve'=>'785', 'downdownarrows'=>'8650', 'downharpoonleft'=>'8643', 'downharpoonright'=>'8642', 'DownLeftRightVector'=>'10576', 'DownLeftTeeVector'=>'10590', 'DownLeftVector'=>'8637', 'DownLeftVectorBar'=>'10582', 'DownRightTeeVector'=>'10591', 'DownRightVector'=>'8641', 'DownRightVectorBar'=>'10583', 'DownTee'=>'8868', 'DownTeeArrow'=>'8615', 'drbkarow'=>'10512', 'drcorn'=>'8991', 'drcrop'=>'8972', 'Dscr'=>'119967', 'dscr'=>'119993', 'DScy'=>'1029', 'dscy'=>'1109', 'dsol'=>'10742', 'Dstrok'=>'272', 'dstrok'=>'273', 'dtdot'=>'8945', 'dtri'=>'9663', 'dtrif'=>'9662', 'duarr'=>'8693', 'duhar'=>'10607', 'dwangle'=>'10662', 'DZcy'=>'1039', 'dzcy'=>'1119', 'dzigrarr'=>'10239', 'easter'=>'10862', 'Ecaron'=>'282', 'ecaron'=>'283', 'ecir'=>'8790', 'ecolon'=>'8789', 'Ecy'=>'1069', 'ecy'=>'1101', 
'eDDot'=>'10871', 'Edot'=>'278', 'edot'=>'279', 'eDot'=>'8785', 'ee'=>'8519', 'efDot'=>'8786', 'Efr'=>'120072', 'efr'=>'120098', 'eg'=>'10906', 'egs'=>'10902', 'egsdot'=>'10904', 'el'=>'10905', 'Element'=>'8712', 'elinters'=>'9191', 'ell'=>'8467', 'els'=>'10901', 'elsdot'=>'10903', 'Emacr'=>'274', 'emacr'=>'275', 'emptyset'=>'8709', 'EmptySmallSquare'=>'9723', 'emptyv'=>'8709', 'EmptyVerySmallSquare'=>'9643', 'emsp13'=>'8196', 'emsp14'=>'8197', 'ENG'=>'330', 'eng'=>'331', 'Eogon'=>'280', 'eogon'=>'281', 'Eopf'=>'120124', 'eopf'=>'120150', 'epar'=>'8917', 'eparsl'=>'10723', 'eplus'=>'10865', 'epsi'=>'949', 'epsiv'=>'1013', 'eqcirc'=>'8790', 'eqcolon'=>'8789', 'eqsim'=>'8770', 'eqslantgtr'=>'10902', 'eqslantless'=>'10901', 'Equal'=>'10869', 'equals'=>'61', 'EqualTilde'=>'8770', 'equest'=>'8799', 'Equilibrium'=>'8652', 'equivDD'=>'10872', 'eqvparsl'=>'10725', 'erarr'=>'10609', 'erDot'=>'8787', 'escr'=>'8495', 'Escr'=>'8496', 'esdot'=>'8784', 'Esim'=>'10867', 'esim'=>'8770', 'excl'=>'33', 'Exists'=>'8707', 'expectation'=>'8496', 'exponentiale'=>'8519', 'ExponentialE'=>'8519', 'fallingdotseq'=>'8786', 'Fcy'=>'1060', 'fcy'=>'1092', 'female'=>'9792', 'ffilig'=>'64259', 'fflig'=>'64256', 'ffllig'=>'64260', 'Ffr'=>'120073', 'ffr'=>'120099', 'filig'=>'64257', 'FilledSmallSquare'=>'9724', 'FilledVerySmallSquare'=>'9642', 'flat'=>'9837', 'fllig'=>'64258', 'fltns'=>'9649', 'Fopf'=>'120125', 'fopf'=>'120151', 'ForAll'=>'8704', 'fork'=>'8916', 'forkv'=>'10969', 'Fouriertrf'=>'8497', 'fpartint'=>'10765', 'frac13'=>'8531', 'frac15'=>'8533', 'frac16'=>'8537', 'frac18'=>'8539', 'frac23'=>'8532', 'frac25'=>'8534', 'frac35'=>'8535', 'frac38'=>'8540', 'frac45'=>'8536', 'frac56'=>'8538', 'frac58'=>'8541', 'frac78'=>'8542', 'frown'=>'8994', 'fscr'=>'119995', 'Fscr'=>'8497', 'gacute'=>'501', 'Gammad'=>'988', 'gammad'=>'989', 'gap'=>'10886', 'Gbreve'=>'286', 'gbreve'=>'287', 'Gcedil'=>'290', 'Gcirc'=>'284', 'gcirc'=>'285', 'Gcy'=>'1043', 'gcy'=>'1075', 'Gdot'=>'288', 'gdot'=>'289', 
'gE'=>'8807', 'gEl'=>'10892', 'gel'=>'8923', 'geq'=>'8805', 'geqq'=>'8807', 'geqslant'=>'10878', 'ges'=>'10878', 'gescc'=>'10921', 'gesdot'=>'10880', 'gesdoto'=>'10882', 'gesdotol'=>'10884', 'gesles'=>'10900', 'Gfr'=>'120074', 'gfr'=>'120100', 'gg'=>'8811', 'Gg'=>'8921', 'ggg'=>'8921', 'gimel'=>'8503', 'GJcy'=>'1027', 'gjcy'=>'1107', 'gl'=>'8823', 'gla'=>'10917', 'glE'=>'10898', 'glj'=>'10916', 'gnap'=>'10890', 'gnapprox'=>'10890', 'gne'=>'10888', 'gnE'=>'8809', 'gneq'=>'10888', 'gneqq'=>'8809', 'gnsim'=>'8935', 'Gopf'=>'120126', 'gopf'=>'120152', 'grave'=>'96', 'GreaterEqual'=>'8805', 'GreaterEqualLess'=>'8923', 'GreaterFullEqual'=>'8807', 'GreaterGreater'=>'10914', 'GreaterLess'=>'8823', 'GreaterSlantEqual'=>'10878', 'GreaterTilde'=>'8819', 'Gscr'=>'119970', 'gscr'=>'8458', 'gsim'=>'8819', 'gsime'=>'10894', 'gsiml'=>'10896', 'Gt'=>'8811', 'gtcc'=>'10919', 'gtcir'=>'10874', 'gtdot'=>'8919', 'gtlPar'=>'10645', 'gtquest'=>'10876', 'gtrapprox'=>'10886', 'gtrarr'=>'10616', 'gtrdot'=>'8919', 'gtreqless'=>'8923', 'gtreqqless'=>'10892', 'gtrless'=>'8823', 'gtrsim'=>'8819', 'Hacek'=>'711', 'hairsp'=>'8202', 'half'=>'189', 'hamilt'=>'8459', 'HARDcy'=>'1066', 'hardcy'=>'1098', 'harrcir'=>'10568', 'harrw'=>'8621', 'Hat'=>'94', 'hbar'=>'8463', 'Hcirc'=>'292', 'hcirc'=>'293', 'heartsuit'=>'9829', 'hercon'=>'8889', 'hfr'=>'120101', 'Hfr'=>'8460', 'HilbertSpace'=>'8459', 'hksearow'=>'10533', 'hkswarow'=>'10534', 'hoarr'=>'8703', 'homtht'=>'8763', 'hookleftarrow'=>'8617', 'hookrightarrow'=>'8618', 'hopf'=>'120153', 'Hopf'=>'8461', 'horbar'=>'8213', 'HorizontalLine'=>'9472', 'hscr'=>'119997', 'Hscr'=>'8459', 'hslash'=>'8463', 'Hstrok'=>'294', 'hstrok'=>'295', 'HumpDownHump'=>'8782', 'HumpEqual'=>'8783', 'hybull'=>'8259', 'hyphen'=>'8208', 'ic'=>'8291', 'Icy'=>'1048', 'icy'=>'1080', 'Idot'=>'304', 'IEcy'=>'1045', 'iecy'=>'1077', 'iff'=>'8660', 'ifr'=>'120102', 'Ifr'=>'8465', 'ii'=>'8520', 'iiiint'=>'10764', 'iiint'=>'8749', 'iinfin'=>'10716', 'iiota'=>'8489', 'IJlig'=>'306', 
'ijlig'=>'307', 'Im'=>'8465', 'Imacr'=>'298', 'imacr'=>'299', 'ImaginaryI'=>'8520', 'imagline'=>'8464', 'imagpart'=>'8465', 'imath'=>'305', 'imof'=>'8887', 'imped'=>'437', 'Implies'=>'8658', 'in'=>'8712', 'incare'=>'8453', 'infintie'=>'10717', 'inodot'=>'305', 'Int'=>'8748', 'intcal'=>'8890', 'integers'=>'8484', 'Integral'=>'8747', 'intercal'=>'8890', 'Intersection'=>'8898', 'intlarhk'=>'10775', 'intprod'=>'10812', 'InvisibleComma'=>'8291', 'InvisibleTimes'=>'8290', 'IOcy'=>'1025', 'iocy'=>'1105', 'Iogon'=>'302', 'iogon'=>'303', 'Iopf'=>'120128', 'iopf'=>'120154', 'iprod'=>'10812', 'iscr'=>'119998', 'Iscr'=>'8464', 'isindot'=>'8949', 'isinE'=>'8953', 'isins'=>'8948', 'isinsv'=>'8947', 'isinv'=>'8712', 'it'=>'8290', 'Itilde'=>'296', 'itilde'=>'297', 'Iukcy'=>'1030', 'iukcy'=>'1110', 'Jcirc'=>'308', 'jcirc'=>'309', 'Jcy'=>'1049', 'jcy'=>'1081', 'Jfr'=>'120077', 'jfr'=>'120103', 'jmath'=>'567', 'Jopf'=>'120129', 'jopf'=>'120155', 'Jscr'=>'119973', 'jscr'=>'119999', 'Jsercy'=>'1032', 'jsercy'=>'1112', 'Jukcy'=>'1028', 'jukcy'=>'1108', 'kappav'=>'1008', 'Kcedil'=>'310', 'kcedil'=>'311', 'Kcy'=>'1050', 'kcy'=>'1082', 'Kfr'=>'120078', 'kfr'=>'120104', 'kgreen'=>'312', 'KHcy'=>'1061', 'khcy'=>'1093', 'KJcy'=>'1036', 'kjcy'=>'1116', 'Kopf'=>'120130', 'kopf'=>'120156', 'Kscr'=>'119974', 'kscr'=>'120000', 'lAarr'=>'8666', 'Lacute'=>'313', 'lacute'=>'314', 'laemptyv'=>'10676', 'lagran'=>'8466', 'lang'=>'10216', 'Lang'=>'10218', 'langd'=>'10641', 'langle'=>'10216', 'lap'=>'10885', 'Laplacetrf'=>'8466', 'Larr'=>'8606', 'larrb'=>'8676', 'larrbfs'=>'10527', 'larrfs'=>'10525', 'larrhk'=>'8617', 'larrlp'=>'8619', 'larrpl'=>'10553', 'larrsim'=>'10611', 'larrtl'=>'8610', 'lat'=>'10923', 'latail'=>'10521', 'lAtail'=>'10523', 'late'=>'10925', 'lbarr'=>'10508', 'lBarr'=>'10510', 'lbbrk'=>'10098', 'lbrace'=>'123', 'lbrack'=>'91', 'lbrke'=>'10635', 'lbrksld'=>'10639', 'lbrkslu'=>'10637', 'Lcaron'=>'317', 'lcaron'=>'318', 'Lcedil'=>'315', 'lcedil'=>'316', 'lcub'=>'123', 'Lcy'=>'1051', 
'lcy'=>'1083', 'ldca'=>'10550', 'ldquor'=>'8222', 'ldrdhar'=>'10599', 'ldrushar'=>'10571', 'ldsh'=>'8626', 'lE'=>'8806', 'LeftAngleBracket'=>'10216', 'leftarrow'=>'8592', 'LeftArrow'=>'8592', 'Leftarrow'=>'8656', 'LeftArrowBar'=>'8676', 'LeftArrowRightArrow'=>'8646', 'leftarrowtail'=>'8610', 'LeftCeiling'=>'8968', 'LeftDoubleBracket'=>'10214', 'LeftDownTeeVector'=>'10593', 'LeftDownVector'=>'8643', 'LeftDownVectorBar'=>'10585', 'LeftFloor'=>'8970', 'leftharpoondown'=>'8637', 'leftharpoonup'=>'8636', 'leftleftarrows'=>'8647', 'leftrightarrow'=>'8596', 'LeftRightArrow'=>'8596', 'Leftrightarrow'=>'8660', 'leftrightarrows'=>'8646', 'leftrightharpoons'=>'8651', 'leftrightsquigarrow'=>'8621', 'LeftRightVector'=>'10574', 'LeftTee'=>'8867', 'LeftTeeArrow'=>'8612', 'LeftTeeVector'=>'10586', 'leftthreetimes'=>'8907', 'LeftTriangle'=>'8882', 'LeftTriangleBar'=>'10703', 'LeftTriangleEqual'=>'8884', 'LeftUpDownVector'=>'10577', 'LeftUpTeeVector'=>'10592', 'LeftUpVector'=>'8639', 'LeftUpVectorBar'=>'10584', 'LeftVector'=>'8636', 'LeftVectorBar'=>'10578', 'lEg'=>'10891', 'leg'=>'8922', 'leq'=>'8804', 'leqq'=>'8806', 'leqslant'=>'10877', 'les'=>'10877', 'lescc'=>'10920', 'lesdot'=>'10879', 'lesdoto'=>'10881', 'lesdotor'=>'10883', 'lesges'=>'10899', 'lessapprox'=>'10885', 'lessdot'=>'8918', 'lesseqgtr'=>'8922', 'lesseqqgtr'=>'10891', 'LessEqualGreater'=>'8922', 'LessFullEqual'=>'8806', 'LessGreater'=>'8822', 'lessgtr'=>'8822', 'LessLess'=>'10913', 'lesssim'=>'8818', 'LessSlantEqual'=>'10877', 'LessTilde'=>'8818', 'lfisht'=>'10620', 'Lfr'=>'120079', 'lfr'=>'120105', 'lg'=>'8822', 'lgE'=>'10897', 'lHar'=>'10594', 'lhard'=>'8637', 'lharu'=>'8636', 'lharul'=>'10602', 'lhblk'=>'9604', 'LJcy'=>'1033', 'ljcy'=>'1113', 'll'=>'8810', 'Ll'=>'8920', 'llarr'=>'8647', 'llcorner'=>'8990', 'Lleftarrow'=>'8666', 'llhard'=>'10603', 'lltri'=>'9722', 'Lmidot'=>'319', 'lmidot'=>'320', 'lmoust'=>'9136', 'lmoustache'=>'9136', 'lnap'=>'10889', 'lnapprox'=>'10889', 'lne'=>'10887', 'lnE'=>'8808', 
'lneq'=>'10887', 'lneqq'=>'8808', 'lnsim'=>'8934', 'loang'=>'10220', 'loarr'=>'8701', 'lobrk'=>'10214', 'longleftarrow'=>'10229', 'LongLeftArrow'=>'10229', 'Longleftarrow'=>'10232', 'longleftrightarrow'=>'10231', 'LongLeftRightArrow'=>'10231', 'Longleftrightarrow'=>'10234', 'longmapsto'=>'10236', 'longrightarrow'=>'10230', 'LongRightArrow'=>'10230', 'Longrightarrow'=>'10233', 'looparrowleft'=>'8619', 'looparrowright'=>'8620', 'lopar'=>'10629', 'Lopf'=>'120131', 'lopf'=>'120157', 'loplus'=>'10797', 'lotimes'=>'10804', 'lowbar'=>'95', 'LowerLeftArrow'=>'8601', 'LowerRightArrow'=>'8600', 'lozenge'=>'9674', 'lozf'=>'10731', 'lpar'=>'40', 'lparlt'=>'10643', 'lrarr'=>'8646', 'lrcorner'=>'8991', 'lrhar'=>'8651', 'lrhard'=>'10605', 'lrtri'=>'8895', 'lscr'=>'120001', 'Lscr'=>'8466', 'lsh'=>'8624', 'Lsh'=>'8624', 'lsim'=>'8818', 'lsime'=>'10893', 'lsimg'=>'10895', 'lsqb'=>'91', 'lsquor'=>'8218', 'Lstrok'=>'321', 'lstrok'=>'322', 'Lt'=>'8810', 'ltcc'=>'10918', 'ltcir'=>'10873', 'ltdot'=>'8918', 'lthree'=>'8907', 'ltimes'=>'8905', 'ltlarr'=>'10614', 'ltquest'=>'10875', 'ltri'=>'9667', 'ltrie'=>'8884', 'ltrif'=>'9666', 'ltrPar'=>'10646', 'lurdshar'=>'10570', 'luruhar'=>'10598', 'male'=>'9794', 'malt'=>'10016', 'maltese'=>'10016', 'Map'=>'10501', 'map'=>'8614', 'mapsto'=>'8614', 'mapstodown'=>'8615', 'mapstoleft'=>'8612', 'mapstoup'=>'8613', 'marker'=>'9646', 'mcomma'=>'10793', 'Mcy'=>'1052', 'mcy'=>'1084', 'mDDot'=>'8762', 'measuredangle'=>'8737', 'MediumSpace'=>'8287', 'Mellintrf'=>'8499', 'Mfr'=>'120080', 'mfr'=>'120106', 'mho'=>'8487', 'mid'=>'8739', 'midast'=>'42', 'midcir'=>'10992', 'minusb'=>'8863', 'minusd'=>'8760', 'minusdu'=>'10794', 'MinusPlus'=>'8723', 'mlcp'=>'10971', 'mldr'=>'8230', 'mnplus'=>'8723', 'models'=>'8871', 'Mopf'=>'120132', 'mopf'=>'120158', 'mp'=>'8723', 'mscr'=>'120002', 'Mscr'=>'8499', 'mstpos'=>'8766', 'multimap'=>'8888', 'mumap'=>'8888', 'Nacute'=>'323', 'nacute'=>'324', 'nap'=>'8777', 'napos'=>'329', 'napprox'=>'8777', 'natur'=>'9838', 
'natural'=>'9838', 'naturals'=>'8469', 'ncap'=>'10819', 'Ncaron'=>'327', 'ncaron'=>'328', 'Ncedil'=>'325', 'ncedil'=>'326', 'ncong'=>'8775', 'ncup'=>'10818', 'Ncy'=>'1053', 'ncy'=>'1085', 'nearhk'=>'10532', 'nearr'=>'8599', 'neArr'=>'8663', 'nearrow'=>'8599', 'NegativeMediumSpace'=>'8203', 'NegativeThickSpace'=>'8203', 'NegativeThinSpace'=>'8203', 'NegativeVeryThinSpace'=>'8203', 'nequiv'=>'8802', 'nesear'=>'10536', 'NestedGreaterGreater'=>'8811', 'NestedLessLess'=>'8810', 'NewLine'=>'10', 'nexist'=>'8708', 'nexists'=>'8708', 'Nfr'=>'120081', 'nfr'=>'120107', 'nge'=>'8817', 'ngeq'=>'8817', 'ngsim'=>'8821', 'ngt'=>'8815', 'ngtr'=>'8815', 'nharr'=>'8622', 'nhArr'=>'8654', 'nhpar'=>'10994', 'nis'=>'8956', 'nisd'=>'8954', 'niv'=>'8715', 'NJcy'=>'1034', 'njcy'=>'1114', 'nlarr'=>'8602', 'nlArr'=>'8653', 'nldr'=>'8229', 'nle'=>'8816', 'nleftarrow'=>'8602', 'nLeftarrow'=>'8653', 'nleftrightarrow'=>'8622', 'nLeftrightarrow'=>'8654', 'nleq'=>'8816', 'nless'=>'8814', 'nlsim'=>'8820', 'nlt'=>'8814', 'nltri'=>'8938', 'nltrie'=>'8940', 'nmid'=>'8740', 'NoBreak'=>'8288', 'NonBreakingSpace'=>'160', 'nopf'=>'120159', 'Nopf'=>'8469', 'Not'=>'10988', 'NotCongruent'=>'8802', 'NotCupCap'=>'8813', 'NotDoubleVerticalBar'=>'8742', 'NotElement'=>'8713', 'NotEqual'=>'8800', 'NotExists'=>'8708', 'NotGreater'=>'8815', 'NotGreaterEqual'=>'8817', 'NotGreaterLess'=>'8825', 'NotGreaterTilde'=>'8821', 'notinva'=>'8713', 'notinvb'=>'8951', 'notinvc'=>'8950', 'NotLeftTriangle'=>'8938', 'NotLeftTriangleEqual'=>'8940', 'NotLess'=>'8814', 'NotLessEqual'=>'8816', 'NotLessGreater'=>'8824', 'NotLessTilde'=>'8820', 'notni'=>'8716', 'notniva'=>'8716', 'notnivb'=>'8958', 'notnivc'=>'8957', 'NotPrecedes'=>'8832', 'NotPrecedesSlantEqual'=>'8928', 'NotReverseElement'=>'8716', 'NotRightTriangle'=>'8939', 'NotRightTriangleEqual'=>'8941', 'NotSquareSubsetEqual'=>'8930', 'NotSquareSupersetEqual'=>'8931', 'NotSubsetEqual'=>'8840', 'NotSucceeds'=>'8833', 'NotSucceedsSlantEqual'=>'8929', 'NotSupersetEqual'=>'8841', 
'NotTilde'=>'8769', 'NotTildeEqual'=>'8772', 'NotTildeFullEqual'=>'8775', 'NotTildeTilde'=>'8777', 'NotVerticalBar'=>'8740', 'npar'=>'8742', 'nparallel'=>'8742', 'npolint'=>'10772', 'npr'=>'8832', 'nprcue'=>'8928', 'nprec'=>'8832', 'nrarr'=>'8603', 'nrArr'=>'8655', 'nrightarrow'=>'8603', 'nRightarrow'=>'8655', 'nrtri'=>'8939', 'nrtrie'=>'8941', 'nsc'=>'8833', 'nsccue'=>'8929', 'Nscr'=>'119977', 'nscr'=>'120003', 'nshortmid'=>'8740', 'nshortparallel'=>'8742', 'nsim'=>'8769', 'nsime'=>'8772', 'nsimeq'=>'8772', 'nsmid'=>'8740', 'nspar'=>'8742', 'nsqsube'=>'8930', 'nsqsupe'=>'8931', 'nsube'=>'8840', 'nsubseteq'=>'8840', 'nsucc'=>'8833', 'nsup'=>'8837', 'nsupe'=>'8841', 'nsupseteq'=>'8841', 'ntgl'=>'8825', 'ntlg'=>'8824', 'ntriangleleft'=>'8938', 'ntrianglelefteq'=>'8940', 'ntriangleright'=>'8939', 'ntrianglerighteq'=>'8941', 'num'=>'35', 'numero'=>'8470', 'numsp'=>'8199', 'nvdash'=>'8876', 'nvDash'=>'8877', 'nVdash'=>'8878', 'nVDash'=>'8879', 'nvHarr'=>'10500', 'nvinfin'=>'10718', 'nvlArr'=>'10498', 'nvrArr'=>'10499', 'nwarhk'=>'10531', 'nwarr'=>'8598', 'nwArr'=>'8662', 'nwarrow'=>'8598', 'nwnear'=>'10535', 'oast'=>'8859', 'ocir'=>'8858', 'Ocy'=>'1054', 'ocy'=>'1086', 'odash'=>'8861', 'Odblac'=>'336', 'odblac'=>'337', 'odiv'=>'10808', 'odot'=>'8857', 'odsold'=>'10684', 'ofcir'=>'10687', 'Ofr'=>'120082', 'ofr'=>'120108', 'ogon'=>'731', 'ogt'=>'10689', 'ohbar'=>'10677', 'ohm'=>'937', 'oint'=>'8750', 'olarr'=>'8634', 'olcir'=>'10686', 'olcross'=>'10683', 'olt'=>'10688', 'Omacr'=>'332', 'omacr'=>'333', 'omid'=>'10678', 'ominus'=>'8854', 'Oopf'=>'120134', 'oopf'=>'120160', 'opar'=>'10679', 'OpenCurlyDoubleQuote'=>'8220', 'OpenCurlyQuote'=>'8216', 'operp'=>'10681', 'Or'=>'10836', 'orarr'=>'8635', 'ord'=>'10845', 'order'=>'8500', 'orderof'=>'8500', 'origof'=>'8886', 'oror'=>'10838', 'orslope'=>'10839', 'orv'=>'10843', 'oS'=>'9416', 'Oscr'=>'119978', 'oscr'=>'8500', 'osol'=>'8856', 'Otimes'=>'10807', 'otimesas'=>'10806', 'ovbar'=>'9021', 'OverBar'=>'8254', 'OverBrace'=>'9182', 
'OverBracket'=>'9140', 'OverParenthesis'=>'9180', 'par'=>'8741', 'parallel'=>'8741', 'parsim'=>'10995', 'parsl'=>'11005', 'PartialD'=>'8706', 'Pcy'=>'1055', 'pcy'=>'1087', 'percnt'=>'37', 'period'=>'46', 'pertenk'=>'8241', 'Pfr'=>'120083', 'pfr'=>'120109', 'phiv'=>'981', 'phmmat'=>'8499', 'phone'=>'9742', 'pitchfork'=>'8916', 'planck'=>'8463', 'planckh'=>'8462', 'plankv'=>'8463', 'plus'=>'43', 'plusacir'=>'10787', 'plusb'=>'8862', 'pluscir'=>'10786', 'plusdo'=>'8724', 'plusdu'=>'10789', 'pluse'=>'10866', 'PlusMinus'=>'177', 'plussim'=>'10790', 'plustwo'=>'10791', 'pm'=>'177', 'Poincareplane'=>'8460', 'pointint'=>'10773', 'popf'=>'120161', 'Popf'=>'8473', 'Pr'=>'10939', 'pr'=>'8826', 'prap'=>'10935', 'prcue'=>'8828', 'pre'=>'10927', 'prE'=>'10931', 'prec'=>'8826', 'precapprox'=>'10935', 'preccurlyeq'=>'8828', 'Precedes'=>'8826', 'PrecedesEqual'=>'10927', 'PrecedesSlantEqual'=>'8828', 'PrecedesTilde'=>'8830', 'preceq'=>'10927', 'precnapprox'=>'10937', 'precneqq'=>'10933', 'precnsim'=>'8936', 'precsim'=>'8830', 'primes'=>'8473', 'prnap'=>'10937', 'prnE'=>'10933', 'prnsim'=>'8936', 'Product'=>'8719', 'profalar'=>'9006', 'profline'=>'8978', 'profsurf'=>'8979', 'Proportion'=>'8759', 'Proportional'=>'8733', 'propto'=>'8733', 'prsim'=>'8830', 'prurel'=>'8880', 'Pscr'=>'119979', 'pscr'=>'120005', 'puncsp'=>'8200', 'Qfr'=>'120084', 'qfr'=>'120110', 'qint'=>'10764', 'qopf'=>'120162', 'Qopf'=>'8474', 'qprime'=>'8279', 'Qscr'=>'119980', 'qscr'=>'120006', 'quaternions'=>'8461', 'quatint'=>'10774', 'quest'=>'63', 'questeq'=>'8799', 'rAarr'=>'8667', 'Racute'=>'340', 'racute'=>'341', 'raemptyv'=>'10675', 'rang'=>'10217', 'Rang'=>'10219', 'rangd'=>'10642', 'range'=>'10661', 'rangle'=>'10217', 'Rarr'=>'8608', 'rarrap'=>'10613', 'rarrb'=>'8677', 'rarrbfs'=>'10528', 'rarrc'=>'10547', 'rarrfs'=>'10526', 'rarrhk'=>'8618', 'rarrlp'=>'8620', 'rarrpl'=>'10565', 'rarrsim'=>'10612', 'Rarrtl'=>'10518', 'rarrtl'=>'8611', 'rarrw'=>'8605', 'ratail'=>'10522', 'rAtail'=>'10524', 'ratio'=>'8758', 
'rationals'=>'8474', 'rbarr'=>'10509', 'rBarr'=>'10511', 'RBarr'=>'10512', 'rbbrk'=>'10099', 'rbrace'=>'125', 'rbrack'=>'93', 'rbrke'=>'10636', 'rbrksld'=>'10638', 'rbrkslu'=>'10640', 'Rcaron'=>'344', 'rcaron'=>'345', 'Rcedil'=>'342', 'rcedil'=>'343', 'rcub'=>'125', 'Rcy'=>'1056', 'rcy'=>'1088', 'rdca'=>'10551', 'rdldhar'=>'10601', 'rdquor'=>'8221', 'rdsh'=>'8627', 'Re'=>'8476', 'realine'=>'8475', 'realpart'=>'8476', 'reals'=>'8477', 'rect'=>'9645', 'REG'=>'174', 'ReverseElement'=>'8715', 'ReverseEquilibrium'=>'8651', 'ReverseUpEquilibrium'=>'10607', 'rfisht'=>'10621', 'rfr'=>'120111', 'Rfr'=>'8476', 'rHar'=>'10596', 'rhard'=>'8641', 'rharu'=>'8640', 'rharul'=>'10604', 'rhov'=>'1009', 'RightAngleBracket'=>'10217', 'rightarrow'=>'8594', 'RightArrow'=>'8594', 'Rightarrow'=>'8658', 'RightArrowBar'=>'8677', 'RightArrowLeftArrow'=>'8644', 'rightarrowtail'=>'8611', 'RightCeiling'=>'8969', 'RightDoubleBracket'=>'10215', 'RightDownTeeVector'=>'10589', 'RightDownVector'=>'8642', 'RightDownVectorBar'=>'10581', 'RightFloor'=>'8971', 'rightharpoondown'=>'8641', 'rightharpoonup'=>'8640', 'rightleftarrows'=>'8644', 'rightleftharpoons'=>'8652', 'rightrightarrows'=>'8649', 'rightsquigarrow'=>'8605', 'RightTee'=>'8866', 'RightTeeArrow'=>'8614', 'RightTeeVector'=>'10587', 'rightthreetimes'=>'8908', 'RightTriangle'=>'8883', 'RightTriangleBar'=>'10704', 'RightTriangleEqual'=>'8885', 'RightUpDownVector'=>'10575', 'RightUpTeeVector'=>'10588', 'RightUpVector'=>'8638', 'RightUpVectorBar'=>'10580', 'RightVector'=>'8640', 'RightVectorBar'=>'10579', 'ring'=>'730', 'risingdotseq'=>'8787', 'rlarr'=>'8644', 'rlhar'=>'8652', 'rmoust'=>'9137', 'rmoustache'=>'9137', 'rnmid'=>'10990', 'roang'=>'10221', 'roarr'=>'8702', 'robrk'=>'10215', 'ropar'=>'10630', 'ropf'=>'120163', 'Ropf'=>'8477', 'roplus'=>'10798', 'rotimes'=>'10805', 'RoundImplies'=>'10608', 'rpar'=>'41', 'rpargt'=>'10644', 'rppolint'=>'10770', 'rrarr'=>'8649', 'Rrightarrow'=>'8667', 'rscr'=>'120007', 'Rscr'=>'8475', 'rsh'=>'8625', 
'Rsh'=>'8625', 'rsqb'=>'93', 'rsquor'=>'8217', 'rthree'=>'8908', 'rtimes'=>'8906', 'rtri'=>'9657', 'rtrie'=>'8885', 'rtrif'=>'9656', 'rtriltri'=>'10702', 'RuleDelayed'=>'10740', 'ruluhar'=>'10600', 'rx'=>'8478', 'Sacute'=>'346', 'sacute'=>'347', 'Sc'=>'10940', 'sc'=>'8827', 'scap'=>'10936', 'sccue'=>'8829', 'sce'=>'10928', 'scE'=>'10932', 'Scedil'=>'350', 'scedil'=>'351', 'Scirc'=>'348', 'scirc'=>'349', 'scnap'=>'10938', 'scnE'=>'10934', 'scnsim'=>'8937', 'scpolint'=>'10771', 'scsim'=>'8831', 'Scy'=>'1057', 'scy'=>'1089', 'sdotb'=>'8865', 'sdote'=>'10854', 'searhk'=>'10533', 'searr'=>'8600', 'seArr'=>'8664', 'searrow'=>'8600', 'semi'=>'59', 'seswar'=>'10537', 'setminus'=>'8726', 'setmn'=>'8726', 'sext'=>'10038', 'Sfr'=>'120086', 'sfr'=>'120112', 'sfrown'=>'8994', 'sharp'=>'9839', 'SHCHcy'=>'1065', 'shchcy'=>'1097', 'SHcy'=>'1064', 'shcy'=>'1096', 'ShortDownArrow'=>'8595', 'ShortLeftArrow'=>'8592', 'shortmid'=>'8739', 'shortparallel'=>'8741', 'ShortRightArrow'=>'8594', 'ShortUpArrow'=>'8593', 'sigmav'=>'962', 'simdot'=>'10858', 'sime'=>'8771', 'simeq'=>'8771', 'simg'=>'10910', 'simgE'=>'10912', 'siml'=>'10909', 'simlE'=>'10911', 'simne'=>'8774', 'simplus'=>'10788', 'simrarr'=>'10610', 'slarr'=>'8592', 'SmallCircle'=>'8728', 'smallsetminus'=>'8726', 'smashp'=>'10803', 'smeparsl'=>'10724', 'smid'=>'8739', 'smile'=>'8995', 'smt'=>'10922', 'smte'=>'10924', 'SOFTcy'=>'1068', 'softcy'=>'1100', 'sol'=>'47', 'solb'=>'10692', 'solbar'=>'9023', 'Sopf'=>'120138', 'sopf'=>'120164', 'spadesuit'=>'9824', 'spar'=>'8741', 'sqcap'=>'8851', 'sqcup'=>'8852', 'Sqrt'=>'8730', 'sqsub'=>'8847', 'sqsube'=>'8849', 'sqsubset'=>'8847', 'sqsubseteq'=>'8849', 'sqsup'=>'8848', 'sqsupe'=>'8850', 'sqsupset'=>'8848', 'sqsupseteq'=>'8850', 'squ'=>'9633', 'square'=>'9633', 'Square'=>'9633', 'SquareIntersection'=>'8851', 'SquareSubset'=>'8847', 'SquareSubsetEqual'=>'8849', 'SquareSuperset'=>'8848', 'SquareSupersetEqual'=>'8850', 'SquareUnion'=>'8852', 'squarf'=>'9642', 'squf'=>'9642', 'srarr'=>'8594', 
'Sscr'=>'119982', 'sscr'=>'120008', 'ssetmn'=>'8726', 'ssmile'=>'8995', 'sstarf'=>'8902', 'Star'=>'8902', 'star'=>'9734', 'starf'=>'9733', 'straightepsilon'=>'1013', 'straightphi'=>'981', 'strns'=>'175', 'Sub'=>'8912', 'subdot'=>'10941', 'subE'=>'10949', 'subedot'=>'10947', 'submult'=>'10945', 'subnE'=>'10955', 'subne'=>'8842', 'subplus'=>'10943', 'subrarr'=>'10617', 'subset'=>'8834', 'Subset'=>'8912', 'subseteq'=>'8838', 'subseteqq'=>'10949', 'SubsetEqual'=>'8838', 'subsetneq'=>'8842', 'subsetneqq'=>'10955', 'subsim'=>'10951', 'subsub'=>'10965', 'subsup'=>'10963', 'succ'=>'8827', 'succapprox'=>'10936', 'succcurlyeq'=>'8829', 'Succeeds'=>'8827', 'SucceedsEqual'=>'10928', 'SucceedsSlantEqual'=>'8829', 'SucceedsTilde'=>'8831', 'succeq'=>'10928', 'succnapprox'=>'10938', 'succneqq'=>'10934', 'succnsim'=>'8937', 'succsim'=>'8831', 'SuchThat'=>'8715', 'Sum'=>'8721', 'sung'=>'9834', 'Sup'=>'8913', 'supdot'=>'10942', 'supdsub'=>'10968', 'supE'=>'10950', 'supedot'=>'10948', 'Superset'=>'8835', 'SupersetEqual'=>'8839', 'suphsol'=>'10185', 'suphsub'=>'10967', 'suplarr'=>'10619', 'supmult'=>'10946', 'supnE'=>'10956', 'supne'=>'8843', 'supplus'=>'10944', 'supset'=>'8835', 'Supset'=>'8913', 'supseteq'=>'8839', 'supseteqq'=>'10950', 'supsetneq'=>'8843', 'supsetneqq'=>'10956', 'supsim'=>'10952', 'supsub'=>'10964', 'supsup'=>'10966', 'swarhk'=>'10534', 'swarr'=>'8601', 'swArr'=>'8665', 'swarrow'=>'8601', 'swnwar'=>'10538', 'Tab'=>'9', 'target'=>'8982', 'tbrk'=>'9140', 'Tcaron'=>'356', 'tcaron'=>'357', 'Tcedil'=>'354', 'tcedil'=>'355', 'Tcy'=>'1058', 'tcy'=>'1090', 'tdot'=>'8411', 'telrec'=>'8981', 'Tfr'=>'120087', 'tfr'=>'120113', 'therefore'=>'8756', 'Therefore'=>'8756', 'thetav'=>'977', 'thickapprox'=>'8776', 'thicksim'=>'8764', 'ThinSpace'=>'8201', 'thkap'=>'8776', 'thksim'=>'8764', 'Tilde'=>'8764', 'TildeEqual'=>'8771', 'TildeFullEqual'=>'8773', 'TildeTilde'=>'8776', 'timesb'=>'8864', 'timesbar'=>'10801', 'timesd'=>'10800', 'tint'=>'8749', 'toea'=>'10536', 'top'=>'8868', 
'topbot'=>'9014', 'topcir'=>'10993', 'Topf'=>'120139', 'topf'=>'120165', 'topfork'=>'10970', 'tosa'=>'10537', 'tprime'=>'8244', 'TRADE'=>'8482', 'triangle'=>'9653', 'triangledown'=>'9663', 'triangleleft'=>'9667', 'trianglelefteq'=>'8884', 'triangleq'=>'8796', 'triangleright'=>'9657', 'trianglerighteq'=>'8885', 'tridot'=>'9708', 'trie'=>'8796', 'triminus'=>'10810', 'TripleDot'=>'8411', 'triplus'=>'10809', 'trisb'=>'10701', 'tritime'=>'10811', 'trpezium'=>'9186', 'Tscr'=>'119983', 'tscr'=>'120009', 'TScy'=>'1062', 'tscy'=>'1094', 'TSHcy'=>'1035', 'tshcy'=>'1115', 'Tstrok'=>'358', 'tstrok'=>'359', 'twixt'=>'8812', 'twoheadleftarrow'=>'8606', 'twoheadrightarrow'=>'8608', 'Uarr'=>'8607', 'Uarrocir'=>'10569', 'Ubrcy'=>'1038', 'ubrcy'=>'1118', 'Ubreve'=>'364', 'ubreve'=>'365', 'Ucy'=>'1059', 'ucy'=>'1091', 'udarr'=>'8645', 'Udblac'=>'368', 'udblac'=>'369', 'udhar'=>'10606', 'ufisht'=>'10622', 'Ufr'=>'120088', 'ufr'=>'120114', 'uHar'=>'10595', 'uharl'=>'8639', 'uharr'=>'8638', 'uhblk'=>'9600', 'ulcorn'=>'8988', 'ulcorner'=>'8988', 'ulcrop'=>'8975', 'ultri'=>'9720', 'Umacr'=>'362', 'umacr'=>'363', 'UnderBar'=>'95', 'UnderBrace'=>'9183', 'UnderBracket'=>'9141', 'UnderParenthesis'=>'9181', 'Union'=>'8899', 'UnionPlus'=>'8846', 'Uogon'=>'370', 'uogon'=>'371', 'Uopf'=>'120140', 'uopf'=>'120166', 'uparrow'=>'8593', 'UpArrow'=>'8593', 'Uparrow'=>'8657', 'UpArrowBar'=>'10514', 'UpArrowDownArrow'=>'8645', 'updownarrow'=>'8597', 'UpDownArrow'=>'8597', 'Updownarrow'=>'8661', 'UpEquilibrium'=>'10606', 'upharpoonleft'=>'8639', 'upharpoonright'=>'8638', 'uplus'=>'8846', 'UpperLeftArrow'=>'8598', 'UpperRightArrow'=>'8599', 'upsi'=>'965', 'Upsi'=>'978', 'UpTee'=>'8869', 'UpTeeArrow'=>'8613', 'upuparrows'=>'8648', 'urcorn'=>'8989', 'urcorner'=>'8989', 'urcrop'=>'8974', 'Uring'=>'366', 'uring'=>'367', 'urtri'=>'9721', 'Uscr'=>'119984', 'uscr'=>'120010', 'utdot'=>'8944', 'Utilde'=>'360', 'utilde'=>'361', 'utri'=>'9653', 'utrif'=>'9652', 'uuarr'=>'8648', 'uwangle'=>'10663', 'vangrt'=>'10652', 
'varepsilon'=>'1013', 'varkappa'=>'1008', 'varnothing'=>'8709', 'varphi'=>'981', 'varpi'=>'982', 'varpropto'=>'8733', 'varr'=>'8597', 'vArr'=>'8661', 'varrho'=>'1009', 'varsigma'=>'962', 'vartheta'=>'977', 'vartriangleleft'=>'8882', 'vartriangleright'=>'8883', 'vBar'=>'10984', 'Vbar'=>'10987', 'vBarv'=>'10985', 'Vcy'=>'1042', 'vcy'=>'1074', 'vdash'=>'8866', 'vDash'=>'8872', 'Vdash'=>'8873', 'VDash'=>'8875', 'Vdashl'=>'10982', 'vee'=>'8744', 'Vee'=>'8897', 'veebar'=>'8891', 'veeeq'=>'8794', 'vellip'=>'8942', 'verbar'=>'124', 'Verbar'=>'8214', 'vert'=>'124', 'Vert'=>'8214', 'VerticalBar'=>'8739', 'VerticalLine'=>'124', 'VerticalSeparator'=>'10072', 'VerticalTilde'=>'8768', 'VeryThinSpace'=>'8202', 'Vfr'=>'120089', 'vfr'=>'120115', 'vltri'=>'8882', 'Vopf'=>'120141', 'vopf'=>'120167', 'vprop'=>'8733', 'vrtri'=>'8883', 'Vscr'=>'119985', 'vscr'=>'120011', 'Vvdash'=>'8874', 'vzigzag'=>'10650', 'Wcirc'=>'372', 'wcirc'=>'373', 'wedbar'=>'10847', 'wedge'=>'8743', 'Wedge'=>'8896', 'wedgeq'=>'8793', 'Wfr'=>'120090', 'wfr'=>'120116', 'Wopf'=>'120142', 'wopf'=>'120168', 'wp'=>'8472', 'wr'=>'8768', 'wreath'=>'8768', 'Wscr'=>'119986', 'wscr'=>'120012', 'xcap'=>'8898', 'xcirc'=>'9711', 'xcup'=>'8899', 'xdtri'=>'9661', 'Xfr'=>'120091', 'xfr'=>'120117', 'xharr'=>'10231', 'xhArr'=>'10234', 'xlarr'=>'10229', 'xlArr'=>'10232', 'xmap'=>'10236', 'xnis'=>'8955', 'xodot'=>'10752', 'Xopf'=>'120143', 'xopf'=>'120169', 'xoplus'=>'10753', 'xotime'=>'10754', 'xrarr'=>'10230', 'xrArr'=>'10233', 'Xscr'=>'119987', 'xscr'=>'120013', 'xsqcup'=>'10758', 'xuplus'=>'10756', 'xutri'=>'9651', 'xvee'=>'8897', 'xwedge'=>'8896', 'YAcy'=>'1071', 'yacy'=>'1103', 'Ycirc'=>'374', 'ycirc'=>'375', 'Ycy'=>'1067', 'ycy'=>'1099', 'Yfr'=>'120092', 'yfr'=>'120118', 'YIcy'=>'1031', 'yicy'=>'1111', 'Yopf'=>'120144', 'yopf'=>'120170', 'Yscr'=>'119988', 'yscr'=>'120014', 'YUcy'=>'1070', 'yucy'=>'1102', 'Zacute'=>'377', 'zacute'=>'378', 'Zcaron'=>'381', 'zcaron'=>'382', 'Zcy'=>'1047', 'zcy'=>'1079', 'Zdot'=>'379', 
/**
 * Check regex pattern for PHP error.
 *
 * The pattern is test-compiled by running preg_match() against an empty
 * string. It is reported as invalid when preg_match() returns false, or when
 * PHP records an error for the call (error_get_last(), or $php_errormsg with
 * track_errors when the error_*_last() functions are unavailable).
 *
 * The return value of preg_match() is checked as well because the recorded-error
 * mechanisms stay empty when a user error handler handles the warning itself.
 *
 * display_errors is turned off for the duration of the check and then restored
 * to the value it had before; track_errors is likewise switched back off if this
 * function had to enable it.
 *
 * @param  string $t Pattern including limiters/modifiers.
 * @return int       0 or 1 if pattern is invalid or valid, respectively.
 */
function hl_regex($t)
{
  if (empty($t) || !is_string($t)) {
    return 0;
  }

  // Reset the error state so that anything recorded afterwards comes from preg_match().
  if ($funcsExist = function_exists('error_clear_last') && function_exists('error_get_last')) {
    error_clear_last();
  } else {
    if ($valTrackErr = ini_get('track_errors')) {
      // Remember any existing message so that it can be put back later.
      $valMsgErr = isset($php_errormsg) ? $php_errormsg : null;
    } else {
      ini_set('track_errors', '1');
    }
    unset($php_errormsg);
  }

  // Keep the expected warning for a bad pattern from being displayed.
  if (($valShowErr = ini_get('display_errors'))) {
    ini_set('display_errors', '0');
  }

  $matchResult = preg_match($t, '');

  if ($funcsExist) {
    $out = error_get_last() == null ? 1 : 0;
  } else {
    $out = isset($php_errormsg) ? 0 : 1;
    if ($valTrackErr) {
      $php_errormsg = isset($valMsgErr) ? $valMsgErr : null;
    } else {
      ini_set('track_errors', '0');
    }
  }

  // A false return means the call failed, even if no error was recorded
  // (e.g., a user error handler consumed the warning).
  if ($matchResult === false) {
    $out = 0;
  }

  // Restore the caller's exact setting (it may be a value such as 'stderr', not just '1').
  if ($valShowErr) {
    ini_set('display_errors', $valShowErr);
  }
  return $out;
}
/**
 * Parse $spec htmLawed argument as array.
 *
 * The specification string is a semi-colon separated list of
 * "tag[,tag...]=attribute[,attribute...]" entries. An attribute may be followed
 * by slash-separated "rule=value" pairs inside parentheses, and an attribute
 * prefixed with "-" is recorded as denied ("-*" is stored as "*"). Entries are
 * processed from last to first.
 *
 * For every tag the result holds: attribute => 1 when no usable rule was
 * given, attribute => array(rule => value) otherwise, and 'deny' =>
 * array(attribute => 1) for denied attributes. 'match' and 'nomatch' rules
 * whose pattern fails hl_regex() are dropped.
 *
 * @param  string $t Value of $spec.
 * @return array     Multidimensional array of form: tag -> attribute -> rule.
 */
function hl_spec($t)
{
  $out = array();

  // Hide special characters used for rules.
  // Inside double-quoted values, separator characters are swapped for control
  // characters (and `" for ") so that the splitting below does not act on them.

  if (!function_exists('hl_aux1')) {
    function hl_aux1($x) {
      return
        substr(
          str_replace(
            array(";", "|", "~", " ", ",", "/", "(", ")", '`"'),
            array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", '"'),
            $x[0]),
          1, -1);
    }
  }
  $t =
    str_replace(
      array("\t", "\r", "\n", ' '),
      '',
      preg_replace_callback('/"(?>(`.|[^"])*)"/sm', 'hl_aux1', trim($t)));

  // Tag, attribute, and rule separators: semi-colon, comma, and slash respectively.

  for ($i = count(($t = explode(';', $t))); --$i>=0;) {
    $ele = $t[$i];
    if (
      empty($ele)
      || ($tagPos = strpos($ele, '=')) === false
      || !strlen(($tagSpec = substr($ele, $tagPos + 1)))
    ) {
      continue;
    }
    $ruleAr = $denyAttrAr = array();
    foreach (explode(',', $tagSpec) as $v) {
      if (!preg_match('`^(-?data-[^:=]+|[a-z:\-\*]+)(?:\((.*?)\))?`i', $v, $m)
          || preg_match('`^-?data-xml`i', $m[1])) {
        continue;
      }
      if (($attr = strtolower($m[1])) == '-*') {
        $denyAttrAr['*'] = 1;
        continue;
      }
      if ($attr[0] == '-') {
        $denyAttrAr[substr($attr, 1)] = 1;
        continue;
      }
      if (!isset($m[2])) {
        $ruleAr[$attr] = 1;
        continue;
      }

      // Gather this attribute's rules in a local array and assign once.
      // Writing "= 1" for an unusable rule and then indexing the same entry as
      // an array for a later rule is an error in PHP 8 ("Cannot use a scalar
      // value as an array"), and in the opposite order it would discard rules
      // that were already parsed.
      $attrRuleAr = (isset($ruleAr[$attr]) && is_array($ruleAr[$attr])) ? $ruleAr[$attr] : array();
      foreach (explode('/', $m[2]) as $ruleStr) {
        $rulePos = strpos($ruleStr, '=');
        if (empty($ruleStr)
            || $rulePos === false
            || $rulePos < 5 // Shortest rule: oneof
        ) {
          continue;
        }
        $rule = strtolower(substr($ruleStr, 0, $rulePos));
        // Put back the characters hidden by hl_aux1.
        $attrRuleAr[$rule] =
          str_replace(
            array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08"),
            array(";", "|", "~", " ", ",", "/", "(", ")"),
            substr($ruleStr, $rulePos + 1));
      }
      if (!count($attrRuleAr)) {
        // No usable rule: the attribute is simply listed.
        $ruleAr[$attr] = 1;
        continue;
      }
      if (isset($attrRuleAr['match']) && !hl_regex($attrRuleAr['match'])) {
        unset($attrRuleAr['match']);
      }
      if (isset($attrRuleAr['nomatch']) && !hl_regex($attrRuleAr['nomatch'])) {
        unset($attrRuleAr['nomatch']);
      }
      $ruleAr[$attr] = $attrRuleAr;
    }

    if (!count($ruleAr) && !count($denyAttrAr)) {
      continue;
    }
    // Apply the parsed rules to every tag named on the left of "=".
    foreach (explode(',', substr($ele, 0, $tagPos)) as $tag) {
      if (!strlen(($tag = strtolower($tag)))) {
        continue;
      }
      if (count($ruleAr)) {
        $out[$tag] = !isset($out[$tag]) ? $ruleAr : array_merge($out[$tag], $ruleAr);
      }
      if (count($denyAttrAr)) {
        $out[$tag]['deny'] = !isset($out[$tag]['deny']) ? $denyAttrAr : array_merge($out[$tag]['deny'], $denyAttrAr);
      }
    }
  }

  return $out;
}
+ */ function hl_tag($t) { - // tag/attribute handler - global $C; - $t = $t[0]; - // invalid < > - if ('< ' === $t) { - return '< '; - } - if ('>' === $t) { - return '>'; - } - if (!preg_match('`^<(/?)([a-zA-Z][^\s>]*)([^>]*?)\s?>$`m', $t, $m)) { - return str_replace(['<', '>'], ['<', '>'], $t); - } - $e = strtolower($m[2]); - static $eIC = ['annotation-xml' => 1, 'color-profile' => 1, 'font-face' => 1, 'font-face-src' => 1, 'font-face-uri' => 1, 'font-face-format' => 1, 'font-face-name' => 1, 'missing-glyph' => 1]; // Illegal cust ele - if ((!strpos($e, '-') && !isset($C['elements'][$e])) || (strpos($e, '-') && (isset($C['elements']['-' . $e]) || (!$C['any_custom_element'] && !isset($C['elements'][$e])) || isset($eIC[$e]) || preg_match('`[^-._0-9a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\x{2ff}\x{370}-\x{37d}\x{37f}-\x{1fff}\x{200c}-\x{200d}\x{2070}-\x{218f}\x{2c00}-\x{2fef}\x{3001}-\x{d7ff}\x{f900}-\x{fdcf}\x{fdf0}-\x{fffd}\x{10000}-\x{effff}]`u', $e)))) { - return ($C['keep_bad'] % 2) ? str_replace(['<', '>'], ['<', '>'], $t) : ''; - } - // attr string - $a = str_replace(["\n", "\r", "\t"], ' ', trim($m[3])); - // tag transform - static $eD = ['acronym' => 1, 'applet' => 1, 'big' => 1, 'center' => 1, 'dir' => 1, 'font' => 1, 'isindex' => 1, 's' => 1, 'strike' => 1, 'tt' => 1]; // Deprecated - if ($C['make_tag_strict'] && isset($eD[$e])) { - $trt = hl_tag2($e, $a, $C['make_tag_strict']); - if (!$e) { - return ($C['keep_bad'] % 2) ? str_replace(['<', '>'], ['<', '>'], $t) : ''; - } - } - // close tag - static $eE = ['area' => 1, 'br' => 1, 'col' => 1, 'command' => 1, 'embed' => 1, 'hr' => 1, 'img' => 1, 'input' => 1, 'isindex' => 1, 'keygen' => 1, 'link' => 1, 'meta' => 1, 'param' => 1, 'source' => 1, 'track' => 1, 'wbr' => 1]; // Empty ele - if (!empty($m[1])) { - return !isset($eE[$e]) ? (empty($C['hook_tag']) ? "" : $C['hook_tag']($e)) : (($C['keep_bad']) % 2 ? 
str_replace(['<', '>'], ['<', '>'], $t) : ''); - } + $t = $t[0]; + global $C; - // open tag & attr - static $aN = ['abbr' => ['td' => 1, 'th' => 1], 'accept' => ['form' => 1, 'input' => 1], 'accept-charset' => ['form' => 1], 'action' => ['form' => 1], 'align' => ['applet' => 1, 'caption' => 1, 'col' => 1, 'colgroup' => 1, 'div' => 1, 'embed' => 1, 'h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1, 'hr' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'legend' => 1, 'object' => 1, 'p' => 1, 'table' => 1, 'tbody' => 1, 'td' => 1, 'tfoot' => 1, 'th' => 1, 'thead' => 1, 'tr' => 1], 'allowfullscreen' => ['iframe' => 1], 'alt' => ['applet' => 1, 'area' => 1, 'img' => 1, 'input' => 1], 'archive' => ['applet' => 1, 'object' => 1], 'async' => ['script' => 1], 'autocomplete' => ['form' => 1, 'input' => 1], 'autofocus' => ['button' => 1, 'input' => 1, 'keygen' => 1, 'select' => 1, 'textarea' => 1], 'autoplay' => ['audio' => 1, 'video' => 1], 'axis' => ['td' => 1, 'th' => 1], 'bgcolor' => ['embed' => 1, 'table' => 1, 'td' => 1, 'th' => 1, 'tr' => 1], 'border' => ['img' => 1, 'object' => 1, 'table' => 1], 'bordercolor' => ['table' => 1, 'td' => 1, 'tr' => 1], 'cellpadding' => ['table' => 1], 'cellspacing' => ['table' => 1], 'challenge' => ['keygen' => 1], 'char' => ['col' => 1, 'colgroup' => 1, 'tbody' => 1, 'td' => 1, 'tfoot' => 1, 'th' => 1, 'thead' => 1, 'tr' => 1], 'charoff' => ['col' => 1, 'colgroup' => 1, 'tbody' => 1, 'td' => 1, 'tfoot' => 1, 'th' => 1, 'thead' => 1, 'tr' => 1], 'charset' => ['a' => 1, 'script' => 1], 'checked' => ['command' => 1, 'input' => 1], 'cite' => ['blockquote' => 1, 'del' => 1, 'ins' => 1, 'q' => 1], 'classid' => ['object' => 1], 'clear' => ['br' => 1], 'code' => ['applet' => 1], 'codebase' => ['applet' => 1, 'object' => 1], 'codetype' => ['object' => 1], 'color' => ['font' => 1], 'cols' => ['textarea' => 1], 'colspan' => ['td' => 1, 'th' => 1], 'compact' => ['dir' => 1, 'dl' => 1, 'menu' => 1, 'ol' => 1, 'ul' => 1], 'content' => 
['meta' => 1], 'controls' => ['audio' => 1, 'video' => 1], 'coords' => ['a' => 1, 'area' => 1], 'crossorigin' => ['img' => 1], 'data' => ['object' => 1], 'datetime' => ['del' => 1, 'ins' => 1, 'time' => 1], 'declare' => ['object' => 1], 'default' => ['track' => 1], 'defer' => ['script' => 1], 'dirname' => ['input' => 1, 'textarea' => 1], 'disabled' => ['button' => 1, 'command' => 1, 'fieldset' => 1, 'input' => 1, 'keygen' => 1, 'optgroup' => 1, 'option' => 1, 'select' => 1, 'textarea' => 1], 'download' => ['a' => 1], 'enctype' => ['form' => 1], 'face' => ['font' => 1], 'flashvars' => ['embed' => 1], 'for' => ['label' => 1, 'output' => 1], 'form' => ['button' => 1, 'fieldset' => 1, 'input' => 1, 'keygen' => 1, 'label' => 1, 'object' => 1, 'output' => 1, 'select' => 1, 'textarea' => 1], 'formaction' => ['button' => 1, 'input' => 1], 'formenctype' => ['button' => 1, 'input' => 1], 'formmethod' => ['button' => 1, 'input' => 1], 'formnovalidate' => ['button' => 1, 'input' => 1], 'formtarget' => ['button' => 1, 'input' => 1], 'frame' => ['table' => 1], 'frameborder' => ['iframe' => 1], 'headers' => ['td' => 1, 'th' => 1], 'height' => ['applet' => 1, 'canvas' => 1, 'embed' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'object' => 1, 'td' => 1, 'th' => 1, 'video' => 1], 'high' => ['meter' => 1], 'href' => ['a' => 1, 'area' => 1, 'link' => 1], 'hreflang' => ['a' => 1, 'area' => 1, 'link' => 1], 'hspace' => ['applet' => 1, 'embed' => 1, 'img' => 1, 'object' => 1], 'icon' => ['command' => 1], 'ismap' => ['img' => 1, 'input' => 1], 'keyparams' => ['keygen' => 1], 'keytype' => ['keygen' => 1], 'kind' => ['track' => 1], 'label' => ['command' => 1, 'menu' => 1, 'option' => 1, 'optgroup' => 1, 'track' => 1], 'language' => ['script' => 1], 'list' => ['input' => 1], 'longdesc' => ['img' => 1, 'iframe' => 1], 'loop' => ['audio' => 1, 'video' => 1], 'low' => ['meter' => 1], 'marginheight' => ['iframe' => 1], 'marginwidth' => ['iframe' => 1], 'max' => ['input' => 1, 'meter' => 1, 
'progress' => 1], 'maxlength' => ['input' => 1, 'textarea' => 1], 'media' => ['a' => 1, 'area' => 1, 'link' => 1, 'source' => 1, 'style' => 1], 'mediagroup' => ['audio' => 1, 'video' => 1], 'method' => ['form' => 1], 'min' => ['input' => 1, 'meter' => 1], 'model' => ['embed' => 1], 'multiple' => ['input' => 1, 'select' => 1], 'muted' => ['audio' => 1, 'video' => 1], 'name' => ['a' => 1, 'applet' => 1, 'button' => 1, 'embed' => 1, 'fieldset' => 1, 'form' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'keygen' => 1, 'map' => 1, 'object' => 1, 'output' => 1, 'param' => 1, 'select' => 1, 'slot' => 1, 'textarea' => 1], 'nohref' => ['area' => 1], 'noshade' => ['hr' => 1], 'novalidate' => ['form' => 1], 'nowrap' => ['td' => 1, 'th' => 1], 'object' => ['applet' => 1], 'open' => ['details' => 1, 'dialog' => 1], 'optimum' => ['meter' => 1], 'pattern' => ['input' => 1], 'ping' => ['a' => 1, 'area' => 1], 'placeholder' => ['input' => 1, 'textarea' => 1], 'pluginspage' => ['embed' => 1], 'pluginurl' => ['embed' => 1], 'poster' => ['video' => 1], 'pqg' => ['keygen' => 1], 'preload' => ['audio' => 1, 'video' => 1], 'prompt' => ['isindex' => 1], 'pubdate' => ['time' => 1], 'radiogroup' => ['command' => 1], 'readonly' => ['input' => 1, 'textarea' => 1], 'referrerpolicy' => ['a' => 1, 'area' => 1, 'img' => 1, 'iframe' => 1, 'link' => 1], 'rel' => ['a' => 1, 'area' => 1, 'link' => 1], 'required' => ['input' => 1, 'select' => 1, 'textarea' => 1], 'rev' => ['a' => 1], 'reversed' => ['ol' => 1], 'rows' => ['textarea' => 1], 'rowspan' => ['td' => 1, 'th' => 1], 'rules' => ['table' => 1], 'sandbox' => ['iframe' => 1], 'scope' => ['td' => 1, 'th' => 1], 'scoped' => ['style' => 1], 'scrolling' => ['iframe' => 1], 'seamless' => ['iframe' => 1], 'selected' => ['option' => 1], 'shape' => ['a' => 1, 'area' => 1], 'size' => ['font' => 1, 'hr' => 1, 'input' => 1, 'select' => 1], 'sizes' => ['link' => 1], 'span' => ['col' => 1, 'colgroup' => 1], 'src' => ['audio' => 1, 'embed' => 1, 'iframe' => 1, 
'img' => 1, 'input' => 1, 'script' => 1, 'source' => 1, 'track' => 1, 'video' => 1], 'srcdoc' => ['iframe' => 1], 'srclang' => ['track' => 1], 'srcset' => ['img' => 1], 'standby' => ['object' => 1], 'start' => ['ol' => 1], 'step' => ['input' => 1], 'summary' => ['table' => 1], 'target' => ['a' => 1, 'area' => 1, 'form' => 1], 'type' => ['a' => 1, 'area' => 1, 'button' => 1, 'command' => 1, 'embed' => 1, 'input' => 1, 'li' => 1, 'link' => 1, 'menu' => 1, 'object' => 1, 'ol' => 1, 'param' => 1, 'script' => 1, 'source' => 1, 'style' => 1, 'ul' => 1], 'typemustmatch' => ['object' => 1], 'usemap' => ['img' => 1, 'input' => 1, 'object' => 1], 'valign' => ['col' => 1, 'colgroup' => 1, 'tbody' => 1, 'td' => 1, 'tfoot' => 1, 'th' => 1, 'thead' => 1, 'tr' => 1], 'value' => ['button' => 1, 'data' => 1, 'input' => 1, 'li' => 1, 'meter' => 1, 'option' => 1, 'param' => 1, 'progress' => 1], 'valuetype' => ['param' => 1], 'vspace' => ['applet' => 1, 'embed' => 1, 'img' => 1, 'object' => 1], 'width' => ['applet' => 1, 'canvas' => 1, 'col' => 1, 'colgroup' => 1, 'embed' => 1, 'hr' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'object' => 1, 'pre' => 1, 'table' => 1, 'td' => 1, 'th' => 1, 'video' => 1], 'wmode' => ['embed' => 1], 'wrap' => ['textarea' => 1]]; // Ele-specific - static $aNA = ['aria-activedescendant' => 1, 'aria-atomic' => 1, 'aria-autocomplete' => 1, 'aria-braillelabel' => 1, 'aria-brailleroledescription' => 1, 'aria-busy' => 1, 'aria-checked' => 1, 'aria-colcount' => 1, 'aria-colindex' => 1, 'aria-colindextext' => 1, 'aria-colspan' => 1, 'aria-controls' => 1, 'aria-current' => 1, 'aria-describedby' => 1, 'aria-description' => 1, 'aria-details' => 1, 'aria-disabled' => 1, 'aria-dropeffect' => 1, 'aria-errormessage' => 1, 'aria-expanded' => 1, 'aria-flowto' => 1, 'aria-grabbed' => 1, 'aria-haspopup' => 1, 'aria-hidden' => 1, 'aria-invalid' => 1, 'aria-keyshortcuts' => 1, 'aria-label' => 1, 'aria-labelledby' => 1, 'aria-level' => 1, 'aria-live' => 1, 'aria-multiline' 
=> 1, 'aria-multiselectable' => 1, 'aria-orientation' => 1, 'aria-owns' => 1, 'aria-placeholder' => 1, 'aria-posinset' => 1, 'aria-pressed' => 1, 'aria-readonly' => 1, 'aria-relevant' => 1, 'aria-required' => 1, 'aria-roledescription' => 1, 'aria-rowcount' => 1, 'aria-rowindex' => 1, 'aria-rowindextext' => 1, 'aria-rowspan' => 1, 'aria-selected' => 1, 'aria-setsize' => 1, 'aria-sort' => 1, 'aria-valuemax' => 1, 'aria-valuemin' => 1, 'aria-valuenow' => 1, 'aria-valuetext' => 1]; // ARIA - static $aNE = ['allowfullscreen' => 1, 'checkbox' => 1, 'checked' => 1, 'command' => 1, 'compact' => 1, 'declare' => 1, 'defer' => 1, 'default' => 1, 'disabled' => 1, 'hidden' => 1, 'inert' => 1, 'ismap' => 1, 'itemscope' => 1, 'multiple' => 1, 'nohref' => 1, 'noresize' => 1, 'noshade' => 1, 'nowrap' => 1, 'open' => 1, 'radio' => 1, 'readonly' => 1, 'required' => 1, 'reversed' => 1, 'selected' => 1]; // Empty - static $aNO = ['onabort' => 1, 'onblur' => 1, 'oncanplay' => 1, 'oncanplaythrough' => 1, 'onchange' => 1, 'onclick' => 1, 'oncontextmenu' => 1, 'oncopy' => 1, 'oncuechange' => 1, 'oncut' => 1, 'ondblclick' => 1, 'ondrag' => 1, 'ondragend' => 1, 'ondragenter' => 1, 'ondragleave' => 1, 'ondragover' => 1, 'ondragstart' => 1, 'ondrop' => 1, 'ondurationchange' => 1, 'onemptied' => 1, 'onended' => 1, 'onerror' => 1, 'onfocus' => 1, 'onformchange' => 1, 'onforminput' => 1, 'oninput' => 1, 'oninvalid' => 1, 'onkeydown' => 1, 'onkeypress' => 1, 'onkeyup' => 1, 'onload' => 1, 'onloadeddata' => 1, 'onloadedmetadata' => 1, 'onloadstart' => 1, 'onlostpointercapture' => 1, 'onmousedown' => 1, 'onmousemove' => 1, 'onmouseout' => 1, 'onmouseover' => 1, 'onmouseup' => 1, 'onmousewheel' => 1, 'onpaste' => 1, 'onpause' => 1, 'onplay' => 1, 'onplaying' => 1, 'onpointercancel' => 1, 'ongotpointercapture' => 1, 'onpointerdown' => 1, 'onpointerenter' => 1, 'onpointerleave' => 1, 'onpointermove' => 1, 'onpointerout' => 1, 'onpointerover' => 1, 'onpointerup' => 1, 'onprogress' => 1, 'onratechange' 
=> 1, 'onreadystatechange' => 1, 'onreset' => 1, 'onsearch' => 1, 'onscroll' => 1, 'onseeked' => 1, 'onseeking' => 1, 'onselect' => 1, 'onshow' => 1, 'onstalled' => 1, 'onsubmit' => 1, 'onsuspend' => 1, 'ontimeupdate' => 1, 'ontoggle' => 1, 'ontouchcancel' => 1, 'ontouchend' => 1, 'ontouchmove' => 1, 'ontouchstart' => 1, 'onvolumechange' => 1, 'onwaiting' => 1, 'onwheel' => 1, 'onauxclick' => 1, 'oncancel' => 1, 'onclose' => 1, 'oncontextlost' => 1, 'oncontextrestored' => 1, 'onformdata' => 1, 'onmouseenter' => 1, 'onmouseleave' => 1, 'onresize' => 1, 'onsecuritypolicyviolation' => 1, 'onslotchange' => 1]; // Event - static $aNP = ['action' => 1, 'cite' => 1, 'classid' => 1, 'codebase' => 1, 'data' => 1, 'href' => 1, 'itemtype' => 1, 'longdesc' => 1, 'model' => 1, 'pluginspage' => 1, 'pluginurl' => 1, 'src' => 1, 'srcset' => 1, 'usemap' => 1]; // Need scheme check; excludes style, on* - static $aNU = ['accesskey' => 1, 'autocapitalize' => 1, 'autofocus' => 1, 'class' => 1, 'contenteditable' => 1, 'contextmenu' => 1, 'dir' => 1, 'draggable' => 1, 'dropzone' => 1, 'enterkeyhint' => 1, 'hidden' => 1, 'id' => 1, 'inert' => 1, 'inputmode' => 1, 'is' => 1, 'itemid' => 1, 'itemprop' => 1, 'itemref' => 1, 'itemscope' => 1, 'itemtype' => 1, 'lang' => 1, 'nonce' => 1, 'role' => 1, 'slot' => 1, 'spellcheck' => 1, 'style' => 1, 'tabindex' => 1, 'title' => 1, 'translate' => 1, 'xmlns' => 1, 'xml:base' => 1, 'xml:lang' => 1, 'xml:space' => 1]; // Univ; excludes on*, aria* - - if ($C['lc_std_val']) { - // predef attr vals for $eAL & $aNE ele - static $aNL = ['all' => 1, 'auto' => 1, 'baseline' => 1, 'bottom' => 1, 'button' => 1, 'captions' => 1, 'center' => 1, 'chapters' => 1, 'char' => 1, 'checkbox' => 1, 'circle' => 1, 'col' => 1, 'colgroup' => 1, 'color' => 1, 'cols' => 1, 'data' => 1, 'date' => 1, 'datetime' => 1, 'datetime-local' => 1, 'default' => 1, 'descriptions' => 1, 'email' => 1, 'file' => 1, 'get' => 1, 'groups' => 1, 'hidden' => 1, 'image' => 1, 'justify' => 1, 
'left' => 1, 'ltr' => 1, 'metadata' => 1, 'middle' => 1, 'month' => 1, 'none' => 1, 'number' => 1, 'object' => 1, 'password' => 1, 'poly' => 1, 'post' => 1, 'preserve' => 1, 'radio' => 1, 'range' => 1, 'rect' => 1, 'ref' => 1, 'reset' => 1, 'right' => 1, 'row' => 1, 'rowgroup' => 1, 'rows' => 1, 'rtl' => 1, 'search' => 1, 'submit' => 1, 'subtitles' => 1, 'tel' => 1, 'text' => 1, 'time' => 1, 'top' => 1, 'url' => 1, 'week' => 1]; - static $eAL = ['a' => 1, 'area' => 1, 'bdo' => 1, 'button' => 1, 'col' => 1, 'fieldset' => 1, 'form' => 1, 'img' => 1, 'input' => 1, 'object' => 1, 'ol' => 1, 'optgroup' => 1, 'option' => 1, 'param' => 1, 'script' => 1, 'select' => 1, 'table' => 1, 'td' => 1, 'textarea' => 1, 'tfoot' => 1, 'th' => 1, 'thead' => 1, 'tr' => 1, 'track' => 1, 'xml:space' => 1]; - $lcase = isset($eAL[$e]) ? 1 : 0; - } + // Check if character not in tag. + + if ($t == '< ') { + return '< '; + } + if ($t == '>') { + return '>'; + } + if (!preg_match('`^<(/?)([a-zA-Z][^\s>]*)([^>]*?)\s?>$`m', $t, $m)) { // Get tag with element name and attributes + return str_replace(array('<', '>'), array('<', '>'), $t); + } + + // Check if element not permitted. Custom element names have certain requirements. + + $ele = strtolower($m[2]); + static $invalidCustomEleAr = array('annotation-xml'=>1, 'color-profile'=>1, 'font-face'=>1, 'font-face-src'=>1, 'font-face-uri'=>1, 'font-face-format'=>1, 'font-face-name'=>1, 'missing-glyph'=>1); + if ( + (!strpos($ele, '-') + && !isset($C['elements'][$ele])) // Not custom element + || (strpos($ele, '-') + && (isset($C['elements']['-' . $ele]) + || (!$C['any_custom_element'] + && !isset($C['elements'][$ele])) + || isset($invalidCustomEleAr[$ele]) + || preg_match( + '`[^-._0-9a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\x{2ff}' + . '\x{370}-\x{37d}\x{37f}-\x{1fff}\x{200c}-\x{200d}\x{2070}-\x{218f}' + . '\x{2c00}-\x{2fef}\x{3001}-\x{d7ff}\x{f900}-\x{fdcf}\x{fdf0}-\x{fffd}\x{10000}-\x{effff}]`u' + , $ele))) + ) { + return (($C['keep_bad']%2) ? 
str_replace(array('<', '>'), array('<', '>'), $t) : ''); + } + + // Attribute string. - $depTr = 0; - if ($C['no_deprecated_attr']) { - // depr attr:applicable ele - static $aND = ['align' => ['caption' => 1, 'div' => 1, 'h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1, 'hr' => 1, 'img' => 1, 'input' => 1, 'legend' => 1, 'object' => 1, 'p' => 1, 'table' => 1], 'bgcolor' => ['table' => 1, 'td' => 1, 'th' => 1, 'tr' => 1], 'border' => ['object' => 1], 'bordercolor' => ['table' => 1, 'td' => 1, 'tr' => 1], 'cellspacing' => ['table' => 1], 'clear' => ['br' => 1], 'compact' => ['dl' => 1, 'ol' => 1, 'ul' => 1], 'height' => ['td' => 1, 'th' => 1], 'hspace' => ['img' => 1, 'object' => 1], 'language' => ['script' => 1], 'name' => ['a' => 1, 'form' => 1, 'iframe' => 1, 'img' => 1, 'map' => 1], 'noshade' => ['hr' => 1], 'nowrap' => ['td' => 1, 'th' => 1], 'size' => ['hr' => 1], 'vspace' => ['img' => 1, 'object' => 1], 'width' => ['hr' => 1, 'pre' => 1, 'table' => 1, 'td' => 1, 'th' => 1]]; - static $eAD = ['a' => 1, 'br' => 1, 'caption' => 1, 'div' => 1, 'dl' => 1, 'form' => 1, 'h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1, 'hr' => 1, 'iframe' => 1, 'img' => 1, 'input' => 1, 'legend' => 1, 'map' => 1, 'object' => 1, 'ol' => 1, 'p' => 1, 'pre' => 1, 'script' => 1, 'table' => 1, 'td' => 1, 'th' => 1, 'tr' => 1, 'ul' => 1]; - $depTr = isset($eAD[$e]) ? 1 : 0; + $attrStr = str_replace(array("\n", "\r", "\t"), ' ', trim($m[3])); + + // Transform deprecated element. + + static $deprecatedEleAr = array('acronym'=>1, 'applet'=>1, 'big'=>1, 'center'=>1, 'dir'=>1, 'font'=>1, 'isindex'=>1, 's'=>1, 'strike'=>1, 'tt'=>1); + if ($C['make_tag_strict'] && isset($deprecatedEleAr[$ele])) { + $eleTransformed = hl_deprecatedElement($ele, $attrStr, $C['make_tag_strict']); // hl_deprecatedElement uses referencing + if (!$ele) { + return (($C['keep_bad'] % 2) ? 
str_replace(array('<', '>'), array('<', '>'), $t) : ''); } + } - // attr name-vals - if (false !== strpos($a, "\x01")) { - $a = preg_replace('`\x01[^\x01]*\x01`', '', $a); - } // No comment/CDATA sec - $mode = 0; - $a = trim($a, ' /'); - $aA = []; - while (strlen($a)) { - $w = 0; - switch ($mode) { - case 0: // Name - if (preg_match('`^[^=\s/\x7f-\x9f]+`', $a, $m)) { - $nm = strtolower($m[0]); - $w = $mode = 1; - $a = ltrim(substr_replace($a, '', 0, strlen($m[0]))); - } - break; - case 1: - if ('=' === $a[0]) { // = - $w = 1; - $mode = 2; - $a = ltrim($a, '= '); - } else { // No val - $w = 1; - $mode = 0; - $a = ltrim($a); - $aA[$nm] = ''; - } - break; - case 2: // Val - if (preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)) { - $a = ltrim($m[2]); - $m = $m[1]; - $w = 1; - $mode = 0; - $aA[$nm] = trim(str_replace('<', '<', ('"' === $m[0] || '\'' === $m[0]) ? substr($m, 1, -1) : $m)); - } - break; - } - if (0 === $w) { // Parse errs, deal with space, " & ' - $a = preg_replace('`^(?:"[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*`', '', $a); - $mode = 0; + // Handle closing tag. + + static $emptyEleAr = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); + if (!empty($m[1])) { + return( + !isset($emptyEleAr[$ele]) + ? (empty($C['hook_tag']) + ? "" + : call_user_func($C['hook_tag'], $ele, 0)) + : ($C['keep_bad'] % 2 + ? str_replace(array('<', '>'), array('<', '>'), $t) + : '')); + } + + // Handle opening tag. + + // -- Sets of possible attributes. + + // .. Element-specific non-global. 
+ + static $attrEleAr = array('abbr'=>array('td'=>1, 'th'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accept-charset'=>array('form'=>1), 'action'=>array('form'=>1), 'align'=>array('applet'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'div'=>1, 'embed'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'async'=>array('script'=>1), 'autocomplete'=>array('form'=>1, 'input'=>1), 'autofocus'=>array('button'=>1, 'input'=>1, 'keygen'=>1, 'select'=>1, 'textarea'=>1), 'autoplay'=>array('audio'=>1, 'video'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1, 'table'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'challenge'=>array('keygen'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('command'=>1, 'input'=>1), 'cite'=>array('blockquote'=>1, 'del'=>1, 'ins'=>1, 'q'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('applet'=>1, 'object'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'content'=>array('meta'=>1), 'controls'=>array('audio'=>1, 'video'=>1), 'coords'=>array('a'=>1, 'area'=>1), 'crossorigin'=>array('img'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1, 
'time'=>1), 'declare'=>array('object'=>1), 'default'=>array('track'=>1), 'defer'=>array('script'=>1), 'dirname'=>array('input'=>1, 'textarea'=>1), 'disabled'=>array('button'=>1, 'command'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'download'=>array('a'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'flashvars'=>array('embed'=>1), 'for'=>array('label'=>1, 'output'=>1), 'form'=>array('button'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'object'=>1, 'output'=>1, 'select'=>1, 'textarea'=>1), 'formaction'=>array('button'=>1, 'input'=>1), 'formenctype'=>array('button'=>1, 'input'=>1), 'formmethod'=>array('button'=>1, 'input'=>1), 'formnovalidate'=>array('button'=>1, 'input'=>1), 'formtarget'=>array('button'=>1, 'input'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('applet'=>1, 'canvas'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'high'=>array('meter'=>1), 'href'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hreflang'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'icon'=>array('command'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'keyparams'=>array('keygen'=>1), 'keytype'=>array('keygen'=>1), 'kind'=>array('track'=>1), 'label'=>array('command'=>1, 'menu'=>1, 'option'=>1, 'optgroup'=>1, 'track'=>1), 'language'=>array('script'=>1), 'list'=>array('input'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'loop'=>array('audio'=>1, 'video'=>1), 'low'=>array('meter'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'max'=>array('input'=>1, 'meter'=>1, 'progress'=>1), 'maxlength'=>array('input'=>1, 'textarea'=>1), 'media'=>array('a'=>1, 'area'=>1, 'link'=>1, 'source'=>1, 'style'=>1), 'mediagroup'=>array('audio'=>1, 'video'=>1), 'method'=>array('form'=>1), 'min'=>array('input'=>1, 'meter'=>1), 
'model'=>array('embed'=>1), 'multiple'=>array('input'=>1, 'select'=>1), 'muted'=>array('audio'=>1, 'video'=>1), 'name'=>array('a'=>1, 'applet'=>1, 'button'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'map'=>1, 'object'=>1, 'output'=>1, 'param'=>1, 'select'=>1, 'slot'=>1, 'textarea'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'novalidate'=>array('form'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'open'=>array('details'=>1, 'dialog'=>1), 'optimum'=>array('meter'=>1), 'pattern'=>array('input'=>1), 'ping'=>array('a'=>1, 'area'=>1), 'placeholder'=>array('input'=>1, 'textarea'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'poster'=>array('video'=>1), 'pqg'=>array('keygen'=>1), 'preload'=>array('audio'=>1, 'video'=>1), 'prompt'=>array('isindex'=>1), 'pubdate'=>array('time'=>1), 'radiogroup'=>array('command'=>1), 'readonly'=>array('input'=>1, 'textarea'=>1), 'referrerpolicy' => array('a'=>1,'area'=>1,'img'=>1,'iframe'=>1,'link'=>1), 'rel'=>array('a'=>1, 'area'=>1, 'link'=>1), 'required'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'rev'=>array('a'=>1), 'reversed'=>array('ol'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'sandbox'=>array('iframe'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scoped'=>array('style'=>1), 'scrolling'=>array('iframe'=>1), 'seamless'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('a'=>1, 'area'=>1), 'size'=>array('font'=>1, 'hr'=>1, 'input'=>1, 'select'=>1), 'sizes'=>array('link'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('audio'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'script'=>1, 'source'=>1, 'track'=>1, 'video'=>1), 'srcdoc'=>array('iframe'=>1), 'srclang'=>array('track'=>1), 'srcset'=>array('img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'step'=>array('input'=>1), 'summary'=>array('table'=>1), 'target'=>array('a'=>1, 
'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'area'=>1, 'button'=>1, 'command'=>1, 'embed'=>1, 'input'=>1, 'li'=>1, 'link'=>1, 'menu'=>1, 'object'=>1, 'ol'=>1, 'param'=>1, 'script'=>1, 'source'=>1, 'style'=>1, 'ul'=>1), 'typemustmatch'=>array('object'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('button'=>1, 'data'=>1, 'input'=>1, 'li'=>1, 'meter'=>1, 'option'=>1, 'param'=>1, 'progress'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'width'=>array('applet'=>1, 'canvas'=>1, 'col'=>1, 'colgroup'=>1, 'embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'wmode'=>array('embed'=>1), 'wrap'=>array('textarea'=>1)); + + // .. Empty. + + static $emptyAttrAr = array('allowfullscreen'=>1, 'checkbox'=>1, 'checked'=>1, 'command'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'default'=>1, 'disabled'=>1, 'hidden'=>1, 'inert'=>1, 'ismap'=>1, 'itemscope'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'open'=>1, 'radio'=>1, 'readonly'=>1, 'required'=>1, 'reversed'=>1, 'selected'=>1); + + // .. Global. + + static $globalAttrAr = array( + + // .... General. + + 'accesskey'=>1, 'autocapitalize'=>1, 'autofocus'=>1, 'class'=>1, 'contenteditable'=>1, 'contextmenu'=>1, 'dir'=>1, 'draggable'=>1, 'dropzone'=>1, 'enterkeyhint'=>1, 'hidden'=>1, 'id'=>1, 'inert'=>1, 'inputmode'=>1, 'is'=>1, 'itemid'=>1, 'itemprop'=>1, 'itemref'=>1, 'itemscope'=>1, 'itemtype'=>1, 'lang'=>1, 'nonce'=>1, 'role'=>1, 'slot'=>1, 'spellcheck'=>1, 'style'=>1, 'tabindex'=>1, 'title'=>1, 'translate'=>1, 'xmlns'=>1, 'xml:base'=>1, 'xml:lang'=>1, 'xml:space'=>1, + + // .... Event. 
+ + 'onabort'=>1, 'onauxclick'=>1, 'onblur'=>1, 'oncancel'=>1, 'oncanplay'=>1, 'oncanplaythrough'=>1, 'onchange'=>1, 'onclick'=>1, 'onclose'=>1, 'oncontextlost'=>1, 'oncontextmenu'=>1, 'oncontextrestored'=>1, 'oncopy'=>1, 'oncuechange'=>1, 'oncut'=>1, 'ondblclick'=>1, 'ondrag'=>1, 'ondragend'=>1, 'ondragenter'=>1, 'ondragleave'=>1, 'ondragover'=>1, 'ondragstart'=>1, 'ondrop'=>1, 'ondurationchange'=>1, 'onemptied'=>1, 'onended'=>1, 'onerror'=>1, 'onfocus'=>1, 'onformchange'=>1, 'onformdata'=>1, 'onforminput'=>1, 'ongotpointercapture'=>1, 'oninput'=>1, 'oninvalid'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onload'=>1, 'onloadeddata'=>1, 'onloadedmetadata'=>1, 'onloadend'=>1, 'onloadstart'=>1, 'onlostpointercapture'=>1, 'onmousedown'=>1, 'onmouseenter'=>1, 'onmouseleave'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onmousewheel'=>1, 'onpaste'=>1, 'onpause'=>1, 'onplay'=>1, 'onplaying'=>1, 'onpointercancel'=>1, 'onpointerdown'=>1, 'onpointerenter'=>1, 'onpointerleave'=>1, 'onpointermove'=>1, 'onpointerout'=>1, 'onpointerover'=>1, 'onpointerup'=>1, 'onprogress'=>1, 'onratechange'=>1, 'onreadystatechange'=>1, 'onreset'=>1, 'onresize'=>1, 'onscroll'=>1, 'onsearch'=>1, 'onsecuritypolicyviolation'=>1, 'onseeked'=>1, 'onseeking'=>1, 'onselect'=>1, 'onshow'=>1, 'onslotchange'=>1, 'onstalled'=>1, 'onsubmit'=>1, 'onsuspend'=>1, 'ontimeupdate'=>1, 'ontoggle'=>1, 'ontouchcancel'=>1, 'ontouchend'=>1, 'ontouchmove'=>1, 'ontouchstart'=>1, 'onvolumechange'=>1, 'onwaiting'=>1, 'onwheel'=>1, + + // .... Aria. 
+ + 'aria-activedescendant'=>1, 'aria-atomic'=>1, 'aria-autocomplete'=>1, 'aria-braillelabel'=>1, 'aria-brailleroledescription'=>1, 'aria-busy'=>1, 'aria-checked'=>1, 'aria-colcount'=>1, 'aria-colindex'=>1, 'aria-colindextext'=>1, 'aria-colspan'=>1, 'aria-controls'=>1, 'aria-current'=>1, 'aria-describedby'=>1, 'aria-description'=>1, 'aria-details'=>1, 'aria-disabled'=>1, 'aria-dropeffect'=>1, 'aria-errormessage'=>1, 'aria-expanded'=>1, 'aria-flowto'=>1, 'aria-grabbed'=>1, 'aria-haspopup'=>1, 'aria-hidden'=>1, 'aria-invalid'=>1, 'aria-keyshortcuts'=>1, 'aria-label'=>1, 'aria-labelledby'=>1, 'aria-level'=>1, 'aria-live'=>1, 'aria-multiline'=>1, 'aria-multiselectable'=>1, 'aria-orientation'=>1, 'aria-owns'=>1, 'aria-placeholder'=>1, 'aria-posinset'=>1, 'aria-pressed'=>1, 'aria-readonly'=>1, 'aria-relevant'=>1, 'aria-required'=>1, 'aria-roledescription'=>1, 'aria-rowcount'=>1, 'aria-rowindex'=>1, 'aria-rowindextext'=>1, 'aria-rowspan'=>1, 'aria-selected'=>1, 'aria-setsize'=>1, 'aria-sort'=>1, 'aria-valuemax'=>1, 'aria-valuemin'=>1, 'aria-valuenow'=>1, 'aria-valuetext'=>1); + + static $urlAttrAr = array('action'=>1, 'archive'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'itemtype'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'poster'=>1, 'src'=>1, 'srcset'=>1, 'usemap'=>1); // Excludes style and on* + + // .. Deprecated. 
+ + $alterDeprecAttr = 0; + if ($C['no_deprecated_attr']) { + static $deprecAttrEleAr = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellspacing'=>array('table'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1)); + static $deprecAttrPossibleEleAr = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1); + $alterDeprecAttr = isset($deprecAttrPossibleEleAr[$ele]) ? 1 : 0; + } + + // -- Standard attribute values that may need lowercasing. 
+ + if ($C['lc_std_val']) { + static $lCaseStdAttrValAr = array('all'=>1, 'auto'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'captions'=>1, 'center'=>1, 'chapters'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'color'=>1, 'cols'=>1, 'data'=>1, 'date'=>1, 'datetime'=>1, 'datetime-local'=>1, 'default'=>1, 'descriptions'=>1, 'email'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'metadata'=>1, 'middle'=>1, 'month'=>1, 'none'=>1, 'number'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'range'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'search'=>1, 'submit'=>1, 'subtitles'=>1, 'tel'=>1, 'text'=>1, 'time'=>1, 'top'=>1, 'url'=>1, 'week'=>1); + static $lCaseStdAttrValPossibleEleAr = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'fieldset'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'track'=>1, 'xml:space'=>1); + $lCaseStdAttrVal = isset($lCaseStdAttrValPossibleEleAr[$ele]) ? 1 : 0; + } + + // -- Get attribute name-value pairs. 
+ + if (strpos($attrStr, "\x01") !== false) { // Remove CDATA/comment + $attrStr = preg_replace('`\x01[^\x01]*\x01`', '', $attrStr); + } + $attrStr = trim($attrStr, ' /'); + $attrAr = array(); + $state = 0; + while (strlen($attrStr)) { + $ok = 0; // For parsing errors, to deal with space, ", and ' characters + switch ($state) { + case 0: if (preg_match('`^[^=\s/\x7f-\x9f]+`', $attrStr, $m)) { // Name + $attr = strtolower($m[0]); + $ok = $state = 1; + $attrStr = ltrim(substr_replace($attrStr, '', 0, strlen($m[0]))); + } + break; case 1: if ($attrStr[0] == '=') { + $ok = 1; + $state = 2; + $attrStr = ltrim($attrStr, '= '); + } else { // No value + $ok = 1; + $state = 0; + $attrStr = ltrim($attrStr); + $attrAr[$attr] = ''; + } + break; case 2: if (preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $attrStr, $m)) { // Value + $attrStr = ltrim($m[2]); + $m = $m[1]; + $ok = 1; + $state = 0; + $attrAr[$attr] = + trim( + str_replace('<', '<', + ($m[0] == '"' || $m[0] == '\'') + ? substr($m, 1, -1) + : $m)); + } + break; + } + if (!$ok) { + $attrStr = preg_replace('`^(?:"[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*`', '', $attrStr); + $state = 0; + } + } + if ($state == 1) { + $attrAr[$attr] = ''; + } + + // -- Clean attributes. + + global $S; + $eleSpec = isset($S[$ele]) ? $S[$ele] : array(); + $filtAttrAr = array(); // Finalized attributes + $deniedAttrAr = $C['deny_attribute']; + + foreach ($attrAr as $attr=>$v) { + + // .. Check if attribute is permitted. + + if ( + + // .... Valid attribute. + + ((isset($attrEleAr[$attr][$ele]) + || isset($globalAttrAr[$attr]) + || preg_match('`data-((?!xml)[^:]+$)`', $attr) + || (strpos($ele, '-') + && strpos($attr, 'data-xml') !== 0)) + + // .... No denial through $spec. + + && (empty($eleSpec) + || (!isset($eleSpec['deny']) + || (!isset($eleSpec['deny']['*']) + && !isset($eleSpec['deny'][$attr]) + && !isset($eleSpec['deny'][preg_replace('`^(on|aria|data).+`', '\\1', $attr). '*'])))) + + // .... No denial through $config. 
+ + && (empty($deniedAttrAr) + || (isset($deniedAttrAr['*']) + ? (isset($deniedAttrAr["-$attr"]) + || isset($deniedAttrAr['-'. preg_replace('`^(on|aria|data)..+`', '\\1', $attr). '*'])) + : (!isset($deniedAttrAr[$attr]) + && !isset($deniedAttrAr[preg_replace('`^(on|aria|data).+`', '\\1', $attr). '*']))))) + + // .... Permit if permission through $spec. + + || (!empty($eleSpec) + && (isset($eleSpec[$attr]) + || (isset($globalAttrAr[$attr]) + && isset($eleSpec[preg_replace('`^(on|aria|data).+`', '\\1', $attr). '*'])))) + ) { + + // .. Attribute with no value or standard value. + + if (isset($emptyAttrAr[$attr])) { + $v = $attr; + } elseif ( + !empty($lCaseStdAttrVal) // ! Rather loose but should be ok + && (($ele != 'button' || $ele != 'input') + || $attr == 'type') + ) { + $v = (isset($lCaseStdAttrValAr[($vNew = strtolower($v))])) ? $vNew : $v; + } + + // .. URLs and CSS expressions in style attribute. + + if ($attr == 'style' && !$C['style_pass']) { + if (false !== strpos($v, '&#')) { // Change any entity to character + static $entityAr = array(' '=>' ', ' '=>' ', ':'=>':', ':'=>':', '"'=>'"', '"'=>'"', '('=>'(', '('=>'(', ')'=>')', ')'=>')', '*'=>'*', '*'=>'*', '/'=>'/', '/'=>'/', '\'=>'\\', '\'=>'\\', 'e'=>'e', 'E'=>'e', 'E'=>'e', 'e'=>'e', 'i'=>'i', 'I'=>'i', 'I'=>'i', 'i'=>'i', 'l'=>'l', 'L'=>'l', 'L'=>'l', 'l'=>'l', 'n'=>'n', 'N'=>'n', 'N'=>'n', 'n'=>'n', 'o'=>'o', 'O'=>'o', 'O'=>'o', 'o'=>'o', 'p'=>'p', 'P'=>'p', 'P'=>'p', 'p'=>'p', 'r'=>'r', 'R'=>'r', 'R'=>'r', 'r'=>'r', 's'=>'s', 'S'=>'s', 'S'=>'s', 's'=>'s', 'u'=>'u', 'U'=>'u', 'U'=>'u', 'u'=>'u', 'x'=>'x', 'X'=>'x', 'X'=>'x', 'x'=>'x', '''=>"'", '''=>"'"); + $v = strtr($v, $entityAr); } - } - if (1 === $mode) { - $aA[$nm] = ''; - } + $v = + preg_replace_callback( + '`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', + 'hl_url', + $v); + $v = !$C['css_expression'] + ? 
preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) + : $v; - // clean attrs - global $S; - $rl = isset($S[$e]) ? $S[$e] : []; - $a = []; - $nfr = 0; - $d = $C['deny_attribute']; - foreach ($aA as $k => $v) { - if (((isset($d['*']) ? isset($d[$k]) : !isset($d[$k])) && (isset($aN[$k][$e]) || isset($aNU[$k]) || (isset($aNO[$k]) && !isset($d['on*'])) || (isset($aNA[$k]) && !isset($d['aria*'])) || (!isset($d['data*']) && preg_match('`data-((?!xml)[^:]+$)`', $k)) || strpos($e, '-')) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) || isset($rl[$k])) { - if (isset($aNE[$k])) { - $v = $k; - } elseif (!empty($lcase) && (('button' !== $e || 'input' !== $e) || 'type' === $k)) { // Rather loose but ?not cause issues - $v = (isset($aNL[($v2 = strtolower($v))])) ? $v2 : $v; - } - if ('style' === $k && !$C['style_pass']) { - if (false !== strpos($v, '&#')) { - static $sC = [' ' => ' ', ' ' => ' ', 'E' => 'e', 'E' => 'e', 'e' => 'e', 'e' => 'e', 'X' => 'x', 'X' => 'x', 'x' => 'x', 'x' => 'x', 'P' => 'p', 'P' => 'p', 'p' => 'p', 'p' => 'p', 'S' => 's', 'S' => 's', 's' => 's', 's' => 's', 'I' => 'i', 'I' => 'i', 'i' => 'i', 'i' => 'i', 'O' => 'o', 'O' => 'o', 'o' => 'o', 'o' => 'o', 'N' => 'n', 'N' => 'n', 'n' => 'n', 'n' => 'n', 'U' => 'u', 'U' => 'u', 'u' => 'u', 'u' => 'u', 'R' => 'r', 'R' => 'r', 'r' => 'r', 'r' => 'r', 'L' => 'l', 'L' => 'l', 'l' => 'l', 'l' => 'l', '(' => '(', '(' => '(', ')' => ')', ')' => ')', ' ' => ':', ' ' => ':', '"' => '"', '"' => '"', ''' => "'", ''' => "'", '/' => '/', '/' => '/', '*' => '*', '*' => '*', '\' => '\\', '\' => '\\']; - $v = strtr($v, $sC); - } - $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v); - $v = !$C['css_expression'] ? 
preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; - } elseif (isset($aNP[$k]) || isset($aNO[$k])) { - $v = str_replace('­', ' ', (false !== strpos($v, '&') ? str_replace(['­', '­', '­'], ' ', $v) : $v)); // double-quoted char: soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software - if ('srcset' === $k) { - $v2 = ''; - // Following pattern tries to implement srcset spec - // See https://html.spec.whatwg.org/dev/images.html#srcset-attributes - // See https://html.spec.whatwg.org/#parse-a-srcset-attribute - $pattern = "/(?:\s*(?:[^,\s][^\s]*[^,\s])(?:\s*\S*\s*))(?:,|$)/"; - preg_match_all($pattern, $v, $matches); - $matches = call_user_func_array('array_merge', $matches); - foreach ($matches as $k1 => $v1) { - $v1 = explode(' ', trim($v1, ', '), 2); - $k1 = isset($v1[1]) ? trim($v1[1]) : ''; - if ('' !== $k1 && !preg_match('/(?:\d+(?:\.\d*)?[wx])/', $k1)) { - // We remove candidates with an invalid descriptor - continue; - } - $v1 = trim($v1[0]); - if (isset($v1[0])) { - $v2 .= hl_prot($v1, $k) . (empty($k1) ? '' : ' ' . $k1) . ', '; - } - } - $v = trim($v2, ', '); - } - if ('itemtype' === $k) { - $v2 = ''; - foreach (explode(' ', $v) as $v1) { - if (isset($v1[0])) { - $v2 .= hl_prot($v1, $k) . 
' '; - } - } - $v = trim($v2, ' '); - } else { - $v = hl_prot($v, $k); - } - if ('href' === $k) { // X-spam - if ($C['anti_mail_spam'] && 0 === strpos($v, 'mailto:')) { - $v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v); - } elseif ($C['anti_link_spam']) { - $r1 = $C['anti_link_spam'][1]; - if (!empty($r1) && preg_match($r1, $v)) { - continue; - } - $r0 = $C['anti_link_spam'][0]; - if (!empty($r0) && preg_match($r0, $v)) { - if (isset($a['rel'])) { - if (!preg_match('`\bnofollow\b`i', $a['rel'])) { - $a['rel'] .= ' nofollow'; - } - } elseif (isset($aA['rel'])) { - if (!preg_match('`\bnofollow\b`i', $aA['rel'])) { - $nfr = 1; - } - } else { - $a['rel'] = 'nofollow'; - } - } - } - } + // .. URLs in other attributes. + + } elseif (isset($urlAttrAr[$attr]) || (isset($globalAttrAr[$attr]) && strpos($attr, 'on') === 0)) { + $v = + str_replace("­", ' ', + (strpos($v, '&') !== false // ! Double-quoted character = soft-hyphen + ? str_replace(array('­', '­', '­'), ' ', $v) + : $v)); + if ($attr == 'srcset' || ($attr == 'archive' && $ele == 'applet')) { + $vNew = ''; + // Following pattern tries to implement srcset spec + // See https://html.spec.whatwg.org/dev/images.html#srcset-attributes + // See https://html.spec.whatwg.org/#parse-a-srcset-attribute + $pattern = "/(?:\s*(?:[^,\s][^\s]*[^,\s])(?:\s*\S*\s*))(?:,|$)/"; + preg_match_all($pattern, $v, $matches); + $matches = call_user_func_array('array_merge', $matches); + foreach ($matches as $k=>$x) { + $x = explode(' ', trim($x, ', '), 2); + $k = isset($x[1]) ? trim($x[1]) : ''; + if ('' !== $k && !preg_match('/(?:\d+(?:\.\d*)?[wx])/', $k)) { + // We remove candidates with an invalid descriptor + continue; } - if (isset($rl[$k]) && is_array($rl[$k]) && ($v = hl_attrval($k, $v, $rl[$k])) === 0) { - continue; + $x = trim($x[0]); + if (isset($x[0])) { + $vNew .= hl_url($x, $attr). (empty($k) ? '' : ' '. $k). 
', '; } - $a[$k] = str_replace('"', '"', $v); + } + $v = trim($vNew, ', '); } - } - if ($nfr) { - $a['rel'] = isset($a['rel']) ? $a['rel'] . ' nofollow' : 'nofollow'; - } - - // rqd attr - static $eAR = ['area' => ['alt' => 'area'], 'bdo' => ['dir' => 'ltr'], 'command' => ['label' => ''], 'form' => ['action' => ''], 'img' => ['src' => 'data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==', 'alt' => 'image'], 'map' => ['name' => ''], 'optgroup' => ['label' => ''], 'param' => ['name' => ''], 'style' => ['scoped' => ''], 'textarea' => ['rows' => '10', 'cols' => '50']]; - if (isset($eAR[$e])) { - foreach ($eAR[$e] as $k => $v) { - if (!isset($a[$k])) { - $a[$k] = isset($v[0]) ? $v : $k; + if ($attr == 'itemtype' || ($attr == 'archive' && $ele == 'object')) { + $vNew = ''; + foreach (explode(' ', $v) as $x) { + if (isset($x[0])) { + $vNew .= hl_url($x, $attr). ' '; } + } + $v = trim($vNew, ' '); + } else { + $v = hl_url($v, $attr); } - } - // depr attr - if ($depTr) { - $c = []; - foreach ($a as $k => $v) { - if ('style' === $k || !isset($aND[$k][$e])) { - continue; + // Anti-spam measure. + + if ($attr == 'href') { + if ($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0) { + $v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v); + } elseif ($C['anti_link_spam']) { + $x = $C['anti_link_spam'][1]; + if (!empty($x) && preg_match($x, $v)) { + continue; } - $v = str_replace(['\\', ':', ';', '&#'], '', $v); - if ('align' === $k) { - unset($a['align']); - if ('img' === $e && ('left' === $v || 'right' === $v)) { - $c[] = 'float: ' . $v; - } elseif (('div' === $e || 'table' === $e) && 'center' === $v) { - $c[] = 'margin: auto'; - } else { - $c[] = 'text-align: ' . $v; - } - } elseif ('bgcolor' === $k) { - unset($a['bgcolor']); - $c[] = 'background-color: ' . $v; - } elseif ('border' === $k) { - unset($a['border']); - $c[] = "border: {$v}px"; - } elseif ('bordercolor' === $k) { - unset($a['bordercolor']); - $c[] = 'border-color: ' . 
$v; - } elseif ('cellspacing' === $k) { - unset($a['cellspacing']); - $c[] = "border-spacing: {$v}px"; - } elseif ('clear' === $k) { - unset($a['clear']); - $c[] = 'clear: ' . ('all' !== $v ? $v : 'both'); - } elseif ('compact' === $k) { - unset($a['compact']); - $c[] = 'font-size: 85%'; - } elseif ('height' === $k || 'width' === $k) { - unset($a[$k]); - $c[] = $k . ': ' . (isset($v[0]) && '*' !== $v[0] ? $v . (ctype_digit($v) ? 'px' : '') : 'auto'); - } elseif ('hspace' === $k) { - unset($a['hspace']); - $c[] = "margin-left: {$v}px; margin-right: {$v}px"; - } elseif ('language' === $k && !isset($a['type'])) { - unset($a['language']); - $a['type'] = 'text/' . strtolower($v); - } elseif ('name' === $k) { - if (2 === $C['no_deprecated_attr'] || ('a' !== $e && 'map' !== $e)) { - unset($a['name']); + $x = $C['anti_link_spam'][0]; + if (!empty($x) && preg_match($x, $v)) { + if (isset($filtAttrAr['rel'])) { + if (!preg_match('`\bnofollow\b`i', $filtAttrAr['rel'])) { + $filtAttrAr['rel'] .= ' nofollow'; } - if (!isset($a['id']) && !preg_match('`\W`', $v)) { - $a['id'] = $v; + } elseif (isset($attrAr['rel'])) { + if (!preg_match('`\bnofollow\b`i', $attrAr['rel'])) { + $addNofollow = 1; } - } elseif ('noshade' === $k) { - unset($a['noshade']); - $c[] = 'border-style: none; border: 0; background-color: gray; color: gray'; - } elseif ('nowrap' === $k) { - unset($a['nowrap']); - $c[] = 'white-space: nowrap'; - } elseif ('size' === $k) { - unset($a['size']); - $c[] = 'size: ' . $v . 'px'; - } elseif ('vspace' === $k) { - unset($a['vspace']); - $c[] = "margin-top: {$v}px; margin-bottom: {$v}px"; - } - } - if (count($c)) { - $c = implode('; ', $c); - $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;') . '; ' . $c . ';' : $c . 
';'; - } - } - // unique ID - if ($C['unique_ids'] && isset($a['id'])) { - if (preg_match('`\s`', ($id = $a['id'])) || (isset($GLOBALS['hl_Ids'][$id]) && 1 === $C['unique_ids'])) { - unset($a['id']); - } else { - while (isset($GLOBALS['hl_Ids'][$id])) { - $id = $C['unique_ids'] . $id; + } else { + $filtAttrAr['rel'] = 'nofollow'; + } } - $GLOBALS['hl_Ids'][($a['id'] = $id)] = 1; - } - } - // xml:lang - if ($C['xml:lang'] && isset($a['lang'])) { - $a['xml:lang'] = isset($a['xml:lang']) ? $a['xml:lang'] : $a['lang']; - if (2 === $C['xml:lang']) { - unset($a['lang']); - } - } - // for transformed tag - if (!empty($trt)) { - $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;') . '; ' . $trt : $trt; - } - // return with empty ele / - if (empty($C['hook_tag'])) { - $aA = ''; - foreach ($a as $k => $v) { - $aA .= " {$k}=\"{$v}\""; + } } + } - return "<{$e}{$aA}" . (isset($eE[$e]) ? ' /' : '') . '>'; + // .. Check attribute value against any $spec rule. + + if (isset($eleSpec[$attr]) + && is_array($eleSpec[$attr]) + && ($v = hl_attributeValue($attr, $v, $eleSpec[$attr], $ele)) === 0) { + continue; + } + + $filtAttrAr[$attr] = str_replace('"', '"', $v); } + } - return $C['hook_tag']($e, $a); -} + // -- Add nofollow. -function hl_tag2(&$e, &$a, $t = 1) -{ - // transform tag - if ('big' === $e) { - $e = 'span'; + if (isset($addNofollow)) { + $filtAttrAr['rel'] = isset($filtAttrAr['rel']) ? $filtAttrAr['rel']. ' nofollow' : 'nofollow'; + } - return 'font-size: larger;'; - } - if ('s' === $e || 'strike' === $e) { - $e = 'span'; + // -- Add required attributes. 
- return 'text-decoration: line-through;'; - } - if ('tt' === $e) { - $e = 'code'; + static $requiredAttrAr = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'command'=>array('label'=>''), 'form'=>array('action'=>''), 'img'=>array('src'=>'data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'style'=>array('scoped'=>''), 'textarea'=>array('rows'=>'10', 'cols'=>'50')); + if (isset($requiredAttrAr[$ele])) { + foreach ($requiredAttrAr[$ele] as $k=>$v) { + if (!isset($filtAttrAr[$k])) { + $filtAttrAr[$k] = isset($v[0]) ? $v : $k; + } + } + } - return ''; - } - if ('center' === $e) { - $e = 'div'; + // -- Transform deprecated attributes into CSS declarations in style attribute. - return 'text-align: center;'; - } - static $fs = ['0' => 'xx-small', '1' => 'xx-small', '2' => 'small', '3' => 'medium', '4' => 'large', '5' => 'x-large', '6' => 'xx-large', '7' => '300%', '-1' => 'smaller', '-2' => '60%', '+1' => 'larger', '+2' => '150%', '+3' => '200%', '+4' => '300%']; - if ('font' === $e) { - $a2 = ''; - while (preg_match('`(^|\s)(color|size)\s*=\s*(\'|")?(.+?)(\\3|\s|$)`i', $a, $m)) { - $a = str_replace($m[0], ' ', $a); - $a2 .= 'color' === strtolower($m[2]) ? (' color: ' . str_replace(['"', ';', ':'], '\'', trim($m[4])) . ';') : (isset($fs[($m = trim($m[4]))]) ? (' font-size: ' . $fs[$m] . ';') : ''); + if ($alterDeprecAttr) { + $css = array(); + foreach ($filtAttrAr as $name=>$val) { + if ($name == 'style' || !isset($deprecAttrEleAr[$name][$ele])) { + continue; + } + $val = str_replace(array('\\', ':', ';', '&#'), '', $val); + if ($name == 'align') { + unset($filtAttrAr['align']); + if ($ele == 'img' && ($val == 'left' || $val == 'right')) { + $css[] = 'float: '. $val; + } elseif (($ele == 'div' || $ele == 'table') && $val == 'center') { + $css[] = 'margin: auto'; + } else { + $css[] = 'text-align: '. 
$val; + } + } elseif ($name == 'bgcolor') { + unset($filtAttrAr['bgcolor']); + $css[] = 'background-color: '. $val; + } elseif ($name == 'border') { + unset($filtAttrAr['border']); + $css[] = "border: {$val}px"; + } elseif ($name == 'bordercolor') { + unset($filtAttrAr['bordercolor']); + $css[] = 'border-color: '. $val; + } elseif ($name == 'cellspacing') { + unset($filtAttrAr['cellspacing']); + $css[] = "border-spacing: {$val}px"; + } elseif ($name == 'clear') { + unset($filtAttrAr['clear']); + $css[] = 'clear: '. ($val != 'all' ? $val : 'both'); + } elseif ($name == 'compact') { + unset($filtAttrAr['compact']); + $css[] = 'font-size: 85%'; + } elseif ($name == 'height' || $name == 'width') { + unset($filtAttrAr[$name]); + $css[] = + $name + . ': ' + . ((isset($val[0]) && $val[0] != '*') + ? $val. (ctype_digit($val) ? 'px' : '') + : 'auto'); + } elseif ($name == 'hspace') { + unset($filtAttrAr['hspace']); + $css[] = "margin-left: {$val}px; margin-right: {$val}px"; + } elseif ($name == 'language' && !isset($filtAttrAr['type'])) { + unset($filtAttrAr['language']); + $filtAttrAr['type'] = 'text/'. strtolower($val); + } elseif ($name == 'name') { + if ($C['no_deprecated_attr'] == 2 || ($ele != 'a' && $ele != 'map')) { + unset($filtAttrAr['name']); } - while (preg_match('`(^|\s)face\s*=\s*(\'|")?([^=]+?)\\2`i', $a, $m) || preg_match('`(^|\s)face\s*=(\s*)(\S+)`i', $a, $m)) { - $a = str_replace($m[0], ' ', $a); - $a2 .= ' font-family: ' . str_replace(['"', ';', ':'], '\'', trim($m[3])) . ';'; + if (!isset($filtAttrAr['id']) && !preg_match('`\W`', $val)) { + $filtAttrAr['id'] = $val; } - $e = 'span'; + } elseif ($name == 'noshade') { + unset($filtAttrAr['noshade']); + $css[] = 'border-style: none; border: 0; background-color: gray; color: gray'; + } elseif ($name == 'nowrap') { + unset($filtAttrAr['nowrap']); + $css[] = 'white-space: nowrap'; + } elseif ($name == 'size') { + unset($filtAttrAr['size']); + $css[] = 'size: '. $val. 
'px'; + } elseif ($name == 'vspace') { + unset($filtAttrAr['vspace']); + $css[] = "margin-top: {$val}px; margin-bottom: {$val}px"; + } + } + if (count($css)) { + $css = implode('; ', $css); + $filtAttrAr['style'] = + isset($filtAttrAr['style']) + ? rtrim($filtAttrAr['style'], ' ;'). '; '. $css. ';' + : $css. ';'; + } + } - return ltrim(str_replace('<', '', $a2)); - } - if ('acronym' === $e) { - $e = 'abbr'; + // -- Enforce unique id attribute values. - return ''; + if ($C['unique_ids'] && isset($filtAttrAr['id'])) { + if (preg_match('`\s`', ($id = $filtAttrAr['id'])) || (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)) { + unset($filtAttrAr['id']); + } else { + while (isset($GLOBALS['hl_Ids'][$id])) { + $id = $C['unique_ids']. $id; + } + $GLOBALS['hl_Ids'][($filtAttrAr['id'] = $id)] = 1; } - if ('dir' === $e) { - $e = 'ul'; + } - return ''; - } - if (2 === $t) { - $e = 0; + // -- Handle lang attributes. - return 0; + if ($C['xml:lang'] && isset($filtAttrAr['lang'])) { + $filtAttrAr['xml:lang'] = isset($filtAttrAr['xml:lang']) ? $filtAttrAr['xml:lang'] : $filtAttrAr['lang']; + if ($C['xml:lang'] == 2) { + unset($filtAttrAr['lang']); } + } - return ''; + // -- If transformed element, modify style attribute. + + if (!empty($eleTransformed)) { + $filtAttrAr['style'] = + isset($filtAttrAr['style']) + ? rtrim($filtAttrAr['style'], ' ;'). '; '. $eleTransformed + : $eleTransformed; + } + + // -- Return opening tag with attributes. + + if (empty($C['hook_tag'])) { + $attrStr = ''; + foreach ($filtAttrAr as $k=>$v) { + $attrStr .= " {$k}=\"{$v}\""; + } + return "<{$ele}{$attrStr}". (isset($emptyEleAr[$ele]) ? ' /' : ''). '>'; + } else { + return call_user_func($C['hook_tag'], $ele, $filtAttrAr); + } } -function hl_tidy($t, $w, $p) +/** + * Tidy/beautify HTM by adding newline and other spaces (padding), + * or compact by removing unnecessary spaces. + * + * @param string $t HTM. + * @param mixed $format -1 (compact) or string (type of padding). 
+ * @param string $parentEle Parent element of $t. + * @return string Tidied (padded) or compacted HTM. + */ +function hl_tidy($t, $format, $parentEle) { - // tidy/compact HTM - if (strpos(' pre,script,textarea', "$p,")) { - return $t; - } - if (!function_exists('hl_aux2')) { - function hl_aux2($m) - { - return $m[1] . str_replace(['<', '>', "\n", "\r", "\t", ' '], ["\x01", "\x02", "\x03", "\x04", "\x05", "\x07"], $m[3]) . $m[4]; + if (strpos(' pre,script,textarea', "$parentEle,")) { + return $t; + } + + // Hide CDATA/comment. + + if (!function_exists('hl_aux2')) { + function hl_aux2($x) { + return + $x[1] + . str_replace( + array("<", ">", "\n", "\r", "\t", ' '), + array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), + $x[3]) + . $x[4]; + } + } + $t = + preg_replace( + array('`(<\w[^>]*(?)\s+`', '`\s+`', '`(<\w[^>]*(?) `'), + array(' $1', ' ', '$1'), + preg_replace_callback( + array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)()`sm'), + 'hl_aux2', + $t)); + + if (($format = strtolower($format)) == -1) { + return + str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t); + } + $padChar = strpos(" $format", 't') ? "\t" : ' '; + $padStr = + preg_match('`\d`', $format, $m) + ? str_repeat($padChar, intval($m[0])) + : str_repeat($padChar, ($padChar == "\t" ? 1 : 2)); + $leadN = preg_match('`[ts]([1-9])`', $format, $m) ? intval($m[1]) : 0; + + // Group elements by line-break requirement. 
+ + $postCloseEleAr = array('br'=>1); // After closing + $preEleAr = array('button'=>1, 'command'=>1, 'input'=>1, 'option'=>1, 'param'=>1, 'track'=>1); // Before opening or closing + $preOpenPostCloseEleAr = array('audio'=>1, 'canvas'=>1, 'caption'=>1, 'dd'=>1, 'dt'=>1, 'figcaption'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'style'=>1, 'summary'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1, 'video'=>1); // Before opening and after closing + $prePostEleAr = array('address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'datalist'=>1, 'details'=>1, 'dialog'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'iframe'=>1, 'main'=>1, 'map'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'picture'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'template'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); // Before and after opening and closing + + $doPad = 1; + $t = explode('<', $t); + while ($doPad) { + $n = $leadN; + $eleAr = $t; + ob_start(); + if (isset($prePostEleAr[$parentEle])) { + echo str_repeat($padStr, ++$n); + } + echo ltrim(array_shift($eleAr)); + for ($i=-1, $j=count($eleAr); ++$i<$j;) { + $rest = ''; + list($tag, $rest) = explode('>', $eleAr[$i]); + $open = $tag[0] == '/' ? 0 : (substr($tag, -1) == '/' ? 1 : ($tag[0] != '!' ? 2 : -1)); + $ele = !$open ? ltrim($tag, '/') : ($open > 0 ? substr($tag, 0, strcspn($tag, ' ')) : 0); + $tag = "<$tag>"; + if (isset($prePostEleAr[$ele])) { + if (!$open) { + if ($n) { + echo "\n", str_repeat($padStr, --$n), "$tag\n", str_repeat($padStr, $n); + } else { + ++$leadN; + ob_end_clean(); + continue 2; + } + } else { + echo "\n", str_repeat($padStr, $n), "$tag\n", str_repeat($padStr, ($open != 1 ? ++$n : $n)); } - } - $t = preg_replace(['`(<\w[^>]*(?)\s+`', '`\s+`', '`(<\w[^>]*(?) 
`'], [' $1', ' ', '$1'], preg_replace_callback(['`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)()`sm'], 'hl_aux2', $t)); - if (($w = strtolower($w)) === -1) { - return str_replace(["\x01", "\x02", "\x03", "\x04", "\x05", "\x07"], ['<', '>', "\n", "\r", "\t", ' '], $t); - } - $s = strpos(" $w", 't') ? "\t" : ' '; - $s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ("\t" === $s ? 1 : 2)); - $N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0; - $a = ['br' => 1]; - $b = ['button' => 1, 'command' => 1, 'input' => 1, 'option' => 1, 'param' => 1, 'track' => 1]; - $c = ['audio' => 1, 'canvas' => 1, 'caption' => 1, 'dd' => 1, 'dt' => 1, 'figcaption' => 1, 'h1' => 1, 'h2' => 1, 'h3' => 1, 'h4' => 1, 'h5' => 1, 'h6' => 1, 'isindex' => 1, 'label' => 1, 'legend' => 1, 'li' => 1, 'object' => 1, 'p' => 1, 'pre' => 1, 'style' => 1, 'summary' => 1, 'td' => 1, 'textarea' => 1, 'th' => 1, 'video' => 1]; - $d = ['address' => 1, 'article' => 1, 'aside' => 1, 'blockquote' => 1, 'center' => 1, 'colgroup' => 1, 'datalist' => 1, 'details' => 1, 'dialog' => 1, 'dir' => 1, 'div' => 1, 'dl' => 1, 'fieldset' => 1, 'figure' => 1, 'footer' => 1, 'form' => 1, 'header' => 1, 'hgroup' => 1, 'hr' => 1, 'iframe' => 1, 'main' => 1, 'map' => 1, 'menu' => 1, 'nav' => 1, 'noscript' => 1, 'ol' => 1, 'optgroup' => 1, 'picture' => 1, 'rbc' => 1, 'rtc' => 1, 'ruby' => 1, 'script' => 1, 'section' => 1, 'select' => 1, 'table' => 1, 'tbody' => 1, 'template' => 1, 'tfoot' => 1, 'thead' => 1, 'tr' => 1, 'ul' => 1]; - $T = explode('<', $t); - $X = 1; - while ($X) { - $n = $N; - $t = $T; - ob_start(); - if (isset($d[$p])) { - echo str_repeat($s, ++$n); + echo $rest; + continue; + } + $pad = "\n". 
str_repeat($padStr, $n); + if (isset($preOpenPostCloseEleAr[$ele])) { + if (!$open) { + echo $tag, $pad, $rest; + } else { + echo $pad, $tag, $rest; } - echo ltrim(array_shift($t)); - for ($i = -1, $j = count($t); ++$i < $j;) { - $r = ''; - list($e, $r) = explode('>', $t[$i]); - $x = '/' === $e[0] ? 0 : ('/' === substr($e, -1) ? 1 : ('!' !== $e[0] ? 2 : -1)); - $y = !$x ? ltrim($e, '/') : ($x > 0 ? substr($e, 0, strcspn($e, ' ')) : 0); - $e = "<$e>"; - if (isset($d[$y])) { - if (!$x) { - if ($n) { - echo "\n", str_repeat($s, --$n), "$e\n", str_repeat($s, $n); - } else { - ++$N; - ob_end_clean(); - continue 2; - } - } else { - echo "\n", str_repeat($s, $n), "$e\n", str_repeat($s, (1 !== $x ? ++$n : $n)); - } - echo $r; - continue; - } - $f = "\n" . str_repeat($s, $n); - if (isset($c[$y])) { - if (!$x) { - echo $e, $f, $r; - } else { - echo $f, $e, $r; - } - } elseif (isset($b[$y])) { - echo $f, $e, $r; - } elseif (isset($a[$y])) { - echo $e, $f, $r; - } elseif (!$y) { - echo $f, $e, $f, $r; - } else { - echo $e, $r; - } + } elseif (isset($preEleAr[$ele])) { + echo $pad, $tag, $rest; + } elseif (isset($postCloseEleAr[$ele])) { + echo $tag, $pad, $rest; + } elseif (!$ele) { + echo $pad, $tag, $pad, $rest; + } else { + echo $tag, $rest; + } + } + $doPad = 0; + } + $t = str_replace(array("\n ", " \n"), "\n", preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents())); + ob_end_clean(); + if (($newline = strpos(" $format", 'r') ? (strpos(" $format", 'n') ? "\r\n" : "\r") : 0)) { + $t = str_replace("\n", $newline, $t); + } + return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t); +} + +/** + * Handle URL to convert to relative/absolute type, + * block scheme, or add anti-spam text. + * + * @param mixed $url URL string, or array with URL value (if $attr is null). + * @param mixed $attr Attribute name string, or null (if $url is array). + * @return string With URL after any conversion/obfuscation. 
+ */ +function hl_url($url, $attr=null) +{ + global $C; + $preUrl = $postUrl = ''; + static $blocker = 'denied:'; + if ($attr == null) { // style attribute value + $attr = 'style'; + $preUrl = $url[1]; + $postUrl = $url[3]; + $url = trim($url[2]); + } + $okSchemeAr = isset($C['schemes'][$attr]) ? $C['schemes'][$attr] : $C['schemes']['*']; + if (isset($okSchemeAr['!']) && substr($url, 0, 7) != $blocker) { + $url = "{$blocker}{$url}"; + } + if (isset($okSchemeAr['*']) + || !strcspn($url, '#?;') + || substr($url, 0, strlen($blocker)) == $blocker + ) { + return "{$preUrl}{$url}{$postUrl}"; + } + if (preg_match('`^([^:?[@!$()*,=/\'\]]+?)(:|&(#(58|x3a)|colon);|%3a|\\\\0{0,4}3a).`i', $url, $m) + && !isset($okSchemeAr[strtolower($m[1])]) // Special crafting suggests malice + ) { + return "{$preUrl}{$blocker}{$url}{$postUrl}"; + } + if ($C['abs_url']) { + if ($C['abs_url'] == -1 && strpos($url, $C['base_url']) === 0) { // Make URL relative + $url = substr($url, strlen($C['base_url'])); + } elseif (empty($m[1])) { // Make URL absolute + if (substr($url, 0, 2) == '//') { + $url = substr($C['base_url'], 0, strpos($C['base_url'], ':') + 1). $url; + } elseif ($url[0] == '/') { + $url = preg_replace('`(^.+?://[^/]+)(.*)`', '$1', $C['base_url']). $url; + } elseif (strcspn($url, './')) { + $url = $C['base_url']. $url; + } else { + preg_match('`^([a-zA-Z\d\-+.]+://[^/]+)(.*)`', $C['base_url'], $m); + $url = preg_replace('`(?<=/)\./`', '', $m[2]. $url); + while (preg_match('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', $url)) { + $url = preg_replace('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', '', $url); } - $X = 0; - } - $t = str_replace(["\n ", " \n"], "\n", preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents())); - ob_end_clean(); - if (($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)) { - $t = str_replace("\n", $l, $t); + $url = $m[1]. 
$url; + } } - - return str_replace(["\x01", "\x02", "\x03", "\x04", "\x05", "\x07"], ['<', '>', "\n", "\r", "\t", ' '], $t); + } + return "{$preUrl}{$url}{$postUrl}"; } +/** + * Report version. + * + * @return string Version. + */ function hl_version() { - // version - return '1.2.7'; + return '1.2.11'; } diff --git a/htmLawed_README.htm b/htmLawed_README.htm index cc88e4e..26513df 100644 --- a/htmLawed_README.htm +++ b/htmLawed_README.htm @@ -7,40 +7,40 @@ htmLawed documentation | htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter @@ -75,6 +75,7 @@

htmLawed documentation

    3.3.3  Tag balancing & proper nesting
    3.3.4  Elements requiring child elements
    3.3.5  Beautify or compact HTML
+    3.3.6  Custom elements
  3.4  Attributes
    3.4.1  Auto-addition of XHTML-required attributes
    3.4.2  Duplicate/invalid id values
@@ -111,11 +112,11 @@

htmLawed documentation


-
htmLawed_README.txt, 16 May 2019
-htmLawed 1.2.4.2, 16 May 2019
+
htmLawed_README.txt, 23 January 2023
+htmLawed 1.2.11
Copyright Santosh Patnaik
Dual licensed with LGPL 3 and GPL 2+
-A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed 
+A PHP Labware internal utility - https://bioinformatics.org/phplabware/internal_utilities/htmLawed 

@@ -152,7 +153,7 @@

htmLawed documentation

  htmLawed:

  *  makes input more secure and standard-compliant for HTML as well as generic XML documents  ^
-  *  supports markup for HTML 5 and microdata, ARIA, Ruby, custom attributes, etc.  ^
+  *  supports markup for HTML 5, custom elements, microdata, ARIA, Ruby, custom attributes, etc.  ^
  *  can beautify or compact HTML  ~
  *  works with input of almost any character encoding and does not affect it
  *  has good tolerance for ill-written HTML
@@ -259,7 +260,7 @@

htmLawed documentation

1.6  Availability (to top)

-  htmLawed can be downloaded for free at its website. Besides the htmLawed.php file, the download has the htmLawed documentation (this document) in plain text and HTML formats, a script for testing, and a text file for test-cases. htmLawed is also available as a PHP class (OOP code) at its website.
+  htmLawed can be downloaded for free at its website. Besides the htmLawed.php file, the download has the htmLawed documentation (this document) in plain text and HTML formats, a script for testing, and a text file for test-cases. htmLawed can be installed with Composer, and is also available as a PHP class (OOP code) – see the website. Official htmLawed releases are also put up on Sourceforge.
@@ -353,6 +354,12 @@

htmLawed documentation

  0 - no measure taken  *
  word - @ in mail address in href attribute value is replaced with specified word

any_custom_element
+  Permit any custom element; regardless of this setting, specific custom elements can be denied or permitted through $config["elements"]; see section 3.3.6
+
0 - no
1 - yes  *
+
  balance
  Balance tags for well-formedness and proper nesting; see section 3.3.3

@@ -409,7 +416,7 @@

htmLawed documentation


  all - *^
  * -acronym -big -center -dir -font -isindex -s -strike -tt -  ~^
applet, audio, canvas, embed, iframe, object, script, and video elements not allowed -  "^
applet, audio, canvas, dialog, embed, iframe, object, script, and video elements not allowed -  "^

  hexdec_entity
  Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see section 3.2
@@ -538,7 +545,7 @@

htmLawed documentation


  A rule begins with an HTML element name(s) (rule-element), for which the rule applies, followed by an equal-to (=) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., th,td,tr=.

-  Rest of the rule consists of comma-separated HTML attribute names. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., -width. To deny all attributes, -* can be used.
+  Rest of the rule consists of comma-separated HTML attribute names, which can be the wildcard references *, aria*, data*, and on* for the sets of all standard, Aria, data-*, and event (on*) attributes, respectively. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., -width. To deny all attributes, -* can be used. All Aria, data-*, and event (on*) attributes can similarly be denied using -aria*, -data*, and -on*, respectively.

  Following shows examples of rule excerpts with rule-element a and the attributes that are being permitted:

@@ -549,8 +556,9 @@

htmLawed documentation

  *  a=-* - none
  *  a=-*, href, title - none except href and title
  *  a=-*, -id, href, title - none except href and title
+  *  a=-on*, -id, href, onclick, title - all except id and on* other than onclick

-  Rules regarding attribute values are optionally specified inside round brackets after attribute names in solidus (/)-separated parameter = value pairs. E.g., title(maxlen=30/minlen=5). None or one or more of the following parameters may be specified:
+  Rules regarding attribute values are optionally specified inside round brackets after attribute names – which cannot be wildcard references like * or data* – in solidus (/)-separated parameter = value pairs. E.g., title(maxlen=30/minlen=5). None or one or more of the following parameters may be specified:

  *  oneof - one or more choices separated by | that the value should match; if only one choice is provided, then the value must match that choice; matching is case-sensitive

@@ -582,13 +590,15 @@

htmLawed documentation


  Special characters: The characters ;, ,, /, (, ), |, ~ and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be escaped by enclosing in pairs of double-quotes ("). A back-tick (`) can be used to escape a literal ". An example rule illustrating this is input=value(maxlen=30/match="/^\w/"/default="your `"ID`"").

Attributes that accept multiple values: If an attribute is accesskey, class, itemtype or rel, which can have multiple, space-separated values, or srcset, which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them.
Attributes that accept multiple values: If an attribute is accesskey, class, itemtype or rel, or archive in case of object element, which can have multiple, space-separated values, or archive in case of applet element and srcset, which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them. The parsing is performed after any URL assessment of the attribute values (section 3.4.3).

  Note: To deny an attribute for all elements for which it is legal, $config["deny_attribute"] (see section 3.4) can be used instead of $spec. Also, attributes can be allowed element-specifically through $spec while being denied globally through $config["deny_attribute"]. The hook_tag parameter (section 3.4.9) can also be possibly used to implement a functionality like that achieved using $spec functionality.

Note: Attributes' specifications for an element may be set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence.
Note: Attributes permitted through $spec are permitted regardless of any denial through $config. An attribute for which $spec indicates both permission and denial will be permitted. E.g., onclick with $spec value of a = *, -onclick, onclick, a = -on*, onclick or a = on*, -onclick will be permitted inside a.
+
Note: Attributes' specifications for an element may be (inadvertently) set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence.

$spec can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of $spec will permit the custom uses of the standard rel attribute in input (not permitted as per standards) and of a non-standard attribute, vFlag, in img.
$spec can also be used to permit custom or non-standard attributes. Thus, the following value of $spec will permit the custom uses of the standard rel attribute in input (not permitted as per standards) and of a non-standard attribute, vFlag, in img.

    $spec = 'img=vFlag; input=rel' @@ -612,7 +622,7 @@

htmLawed documentation


  When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially dangerous HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, following increase security risks:

-  *  Allowing script, applet, embed, iframe, canvas, audio, video or object elements, or certain of their attributes like allowscriptaccess
+  *  Allowing script, applet, embed, iframe, canvas, audio, video, dialog or object elements, or certain of their attributes like allowscriptaccess

  *  Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., <!--[if gte IE 4]><script>alert("xss");</script><![endif]-->)

@@ -781,7 +791,7 @@

htmLawed documentation


  It should be borne in mind that no browser application is 100% standard-compliant, standard specifications continue to evolve, and many browsers accept commonly used non-standard HTML. Regarding security, note that unsafe HTML code is not legally invalid per se.

-  *  By default, htmLawed will not strictly adhere to the current HTML standard. Admins can configure htmLawed to be more strict about standard compliance. Standard specification for HTML is continuously evolving. There are two bodies (W3C and WHATWG) that specify the standard and their specifications are not identical. E.g., as in mid-2013, the border attribute is valid in table as per W3C but not WHATWG. Thus, htmLawed may not be fully compliant with the standard of a specific group. The HTML standards/rules that htmLawed uses in its logic are a mix of the W3C and WHATWG standards, and can be lax because of the laxity of HTML interpreters (browsers) regarding standards.
+  *  htmLawed might not strictly adhere to current HTML standards as the standard specification for HTML by WHATWG is continuously evolving, and there is laxity among HTML interpreters (browsers) regarding standards. Admins can configure htmLawed to be more strict about standard compliance.

  *  In general, htmLawed processes input to generate output that is most likely to be standard-compatible in most users' browsers. Thus, for example, it does not enforce the required value of 0 on border attribute of img (an HTML version 5 specification).

@@ -791,9 +801,9 @@

htmLawed documentation


  *  By default, htmLawed won't check many attribute values for standard compliance. E.g., width="20m" with the dimension in non-standard m is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the hook_tag parameter (section 3.4.9) or $spec to enforce finer checks on attribute values.

-  *  By default, htmLawed considers all ARIA, data-*, event and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the itemtype microdata attribute is permitted only in elements that also have the itemscope attribute. Admins can configure htmLawed to be more strict about this (section 2.3).
+  *  By default, htmLawed considers all ARIA, data-*, event, and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the itemtype microdata attribute is permitted only in elements that also have the itemscope attribute. Admins can configure htmLawed to be more strict about this (section 2.3).

-  *  The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, $spec can be used to allow custom attributes (section 2.3).
+  *  The attributes, whether deprecated (which can be transformed by htmLawed) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, $spec can be used to allow custom attributes (section 2.3).

  *  Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the hook_tag parameter (section 3.4.9) or $spec for finer checks. Perhaps the best option is to disallow style but allow class attributes with the right oneof or match values for class, and have the various class style properties in .css CSS stylesheet files.

@@ -817,7 +827,7 @@

htmLawed documentation


  *  htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., <a href="http://x%22+style=%22background-image:xss">x</a>). These arise when browsers mis-identify markup in escaped text, defeating the very purpose of escaping text (a bad browser will read the given example as <a href="http://x" style="background-image:xss">x</a>).

-  *  Because of poor Unicode support in PHP, htmLawed does not remove the high value HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section 3.1).
+  *  Because of inadequate Unicode support in PHP, htmLawed does not remove the high value HTML-invalid characters with multi-byte code-points. Such characters, however, are extremely unlikely to be in the input (see section 3.1).

  *  htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML meta tags, this can permit an exploit (like Google's UTF-7/XSS vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect.

@@ -974,7 +984,7 @@

htmLawed documentation


  Valid character entities take the form &*; where * is #x followed by a hexadecimal number (hexadecimal numeric entity; like &#xA0; for non-breaking space), or alphanumeric like gt (external or named entity; like &nbsp; for non-breaking space), or # followed by a number (decimal numeric entity; like &#160; for non-breaking space). Character entities referring to the soft-hyphen character (the &shy; or \xad character; hexadecimal code-point ad [decimal 173]) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.

-  htmLawed (function hl_ent()):
+  htmLawed (function hl_entity()):

  *  Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous)

@@ -984,7 +994,7 @@

htmLawed documentation


  *  Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, 7f to 84, 86 to 9f, and fdd0 to fddf, or decimally, 127 to 132, 134 to 159, and 64976 to 64991). Entities referring to the remaining discouraged characters (see section 5.1 for a full list) are let through.

-  *  Neutralizes named entities that are not in the specifications
+  *  Neutralizes named entities that are not in the HTML5 specification

  *  Optionally converts valid HTML-specific named entities except &gt;, &lt;, &quot;, and &amp; to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is 2) for generic XML-compliance. For this, $config["named_entity"] should be 1.

@@ -1033,17 +1043,25 @@

htmLawed documentation


  See section 3.3.3 for differences between the various non-zero $config["keep_bad"] values.

-  htmLawed by default permits these 118 HTML elements:
+  htmLawed by default permits these 122 HTML elements:

-    a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr +    a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dialog, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, picture, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, slot, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, template, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr +
+
+  htmLawed also supports use of custom HTML elements, but this support can be turned off when $config is appropriately set (i.e., in default configuration, such elements are permitted); see section 3.3.6.
+
+  Elements math and svg are not supported. They and their content will get filtered unless a strategy like in section 3.9 is used.
+
+  Elements like acronym, applet, basefont, bgsound, big, blink, center, command, dir, font, hgroup, image, keygen, marquee, menuitem, nobr, noembed, rb, rtc, shadow, spacer, strike, tt, and xmp are currently obsolete/deprecated. Some of them, like acronym and keygen, are supported in htmLawed (see above list). Tag transformation is possible for improving compliance with HTML standards -- most, but not all, of the obsolete/deprecated elements are converted to valid  ones; see section 3.3.2.

+  These 16 htmLawed-supported elements are empty elements that have an opening tag with possible content but no element content (thus, no closing tag): area, br, col, command, embed, hr, img, input, isindex, keygen, link, meta, param, source, track, and wbr.

-  The HTML version 4 elements acronym, applet, big, center, dir, font, strike, and tt are obsolete/deprecated in HTML version 5. On the other hand, the obsolete/deprecated HTML 4 elements embed, menu and u are no longer so in HTML 5. Elements new to HTML 5 are article, aside, audio, bdi, canvas, command, data, datalist, details, figure, figcaption, footer, header, hgroup, keygen, link, main, mark, meta, meter, nav, output, progress, section, source, style, summary, time, track, video, and wbr. The link, meta and style elements exist in HTML 4 but are not allowed in the HTML body. These 16 elements are empty elements that have an opening tag with possible content but no element content (thus, no closing tag): area, br, col, command, embed, hr, img, input, isindex, keygen, link, meta, param, source, track, and wbr.
+  As per standards, closing tags are optional for these elements under certain conditions: caption, colgroup, dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, and tr. By default, htmLawed will add a missing closing tag for such elements, unless balancing (section 3.3.3) is turned off.

-  With $config["safe"] = 1, the default set will exclude applet, audio, canvas, embed, iframe, object, script and video; see section 3.6.
+  With $config["safe"] = 1, the default set of htmLawed-supported elements will exclude applet, audio, canvas, dialog, embed, iframe, object, script and video; see section 3.6.

-  When $config["elements"], which specifies allowed elements, is properly defined, and neither empty nor set to 0 or *, the default set is not used. To have elements added to or removed from the default set, a +/- notation is used. E.g., *-script-object implies that only script and object are disallowed, whereas *+embed means that noembed is also allowed. Elements can also be specified as comma separated names. E.g., a, b, i means only a, b and i are permitted. In this notation, *, + and - have no significance and can actually cause a mis-reading.
+  When $config["elements"], which specifies allowed elements, is properly defined, and neither empty nor set to 0 or *, the default set is not used. To have elements added to or removed from the default set, a +/- notation is used. E.g., *-script-object implies that only script and object are disallowed, whereas *+noembed means that noembed is also allowed. For an element with a hyphen in name, use round brackets around the name; e.g., (my-custom-element). Elements can also be specified as comma separated names. E.g., a, b, i means only a, b and i are permitted. In this notation, *, + and - have no significance and can actually cause a mis-reading.

  Some more examples of $config["elements"] values indicating permitted elements (note that empty spaces are liberally allowed for clarity):

@@ -1051,6 +1069,7 @@

htmLawed documentation

  *  *-script -- all excluding script
  *  * -acronym -big -center -dir -font -isindex -s -strike -tt -- only non-obsolete/deprecated elements of HTML5
  *  *+noembed-script -- all including noembed excluding script
+  *  *+noembed+(my-custom-element) -- all including noembed and my-custom-element

  Some mis-usages (and the resulting permitted elements) that can be avoided:

@@ -1064,11 +1083,9 @@

htmLawed documentation


  Basically, when using the +/- notation, commas (,) should not be used, and vice versa, and * should be used with the former but not the latter.

Note: Even if an element that is not in the default set is allowed through $config["elements"], like noembed in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ($config["balance"] set to 0). Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which define various arrays in the function hl_bal() to accommodate the element and its nesting properties.
Note: Even if an element that is not in the default set is allowed through $config["elements"], like noembed in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ($config["balance"] set to 0). Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which defines various arrays in the function hl_balance() to accommodate the element and its nesting properties.

-  A possible second way to specify allowed elements is to set $config["parent"] to an element name that supposedly will hold the input, and to set $config["balance"] to 1. During tag balancing (see section 3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to div if $config["parent"] is empty, body, or an element not in htmLawed's default set of 118 elements.
-
Tag transformation is possible for improving compliance with HTML standards -- most of the obsolete/deprecated elements of HTML version 5 are converted to valid  ones; see section 3.3.2.
+  A possible second way to specify allowed elements is to set $config["parent"] to an element name that supposedly will hold the input, and to set $config["balance"] to 1. During tag balancing (see section 3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to div if $config["parent"] is empty, body, or an element not in htmLawed's default set of 122 elements.

3.3.1  Handling of comments & CDATA sections @@ -1076,7 +1093,7 @@

htmLawed documentation


  CDATA sections have the format <![CDATA[...anything but not "]]>"...]]>, and HTML comments, <!--...anything but not "-->"... -->. Neither HTML comments nor CDATA sections can reside inside tags. HTML comments can exist anywhere else, but CDATA sections can exist only where plain text is allowed (e.g., immediately inside td element content but not immediately inside tr element content).

-  htmLawed (function hl_cmtcd()) handles HTML comments or CDATA sections depending on the values of $config["comment"] or $config["cdata"]. If 0, such markup is not looked for and the text is processed like plain text. If 1, it is removed completely. If 2, it is preserved but any <, > and & inside are changed to entities. If 3 for $config["cdata"], or 3 or 4 for $config["comment"], they are left as such. When $config["comment"] is set to 4, htmLawed will not force a space character before the --> comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook).
+  htmLawed (function hl_commentCdata()) handles HTML comments or CDATA sections depending on the values of $config["comment"] or $config["cdata"]. If 0, such markup is not looked for and the text is processed like plain text. If 1, it is removed completely. If 2, it is preserved but any <, > and & inside are changed to entities. If 3 for $config["cdata"], or 3 or 4 for $config["comment"], they are left as such. When $config["comment"] is set to 4, htmLawed will not force a space character before the --> comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook).

  Note that for the last two cases, HTML comments and CDATA sections will always be removed from tag content (function hl_tag()).

@@ -1120,14 +1137,14 @@

htmLawed documentation

3.3.2  Tag-transformation for better compliance with standards (to top)

-  If $config["make_tag_strict"] is set and not 0, following deprecated elements (and attributes), as per HTML 5 specification, even if admin-permitted, are mutated as indicated (element content remains intact; function hl_tag2()):
+  If $config["make_tag_strict"] is set and not 0, following deprecated elements (and attributes), even if admin-permitted, are mutated as indicated (element content remains intact; function hl_deprecatedElement()):

  *  acronym - abbr
  *  applet - based on $config["make_tag_strict"], unchanged (1) or removed (2)
  *  big - span style="font-size: larger;"
  *  center - div style="text-align: center;"
  *  dir - ul
-  *  font (face, size, color) -    span style="font-family: ; font-size: ; color: ;" (size transformation reference)
+  *  font (face, size, color) - span style="font-family: ; font-size: ; color: ;" (size transformation reference)
  *  isindex - based on $config["make_tag_strict"], unchanged (1) or removed (2)
  *  s - span style="text-decoration: line-through;"
  *  strike - span style="text-decoration: line-through;"
@@ -1153,7 +1170,7 @@

htmLawed documentation

    <div style="text-align: center;">
-     The PHP <span style="text-decoration: line-through;">software</span> script used for this <span style="text-decoration: line-through;">web-page</span> web-page is <span style="font-weight: bold; font-family: arial; color: red; font-size: 200%;">htmLawedTest.php</span>, from <span style="color:green; text-decoration: underline;">PHP Labware</span>. +     The PHP <span style="text-decoration: line-through;">software</span> script used for this <span style="text-decoration: line-through;">web-page</span> web-page is <span style="font-weight: bold; font-size: 200%; color: red; font-family: arial;">htmLawedTest.php</span>, from <u style="color:green">PHP Labware</u>.
    </div> @@ -1164,7 +1181,7 @@

htmLawed documentation

3.3.3  Tag balancing & proper nesting (to top)

-  If $config["balance"] is set to 1, htmLawed (function hl_bal()) checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them).
+  If $config["balance"] is set to 1, htmLawed (function hl_balance()) checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them).

  Depending on the value of $config["keep_bad"] (see section 2.2 and section 3.3), illegal content may be removed or neutralized to plain text by converting < and > to entities:

@@ -1258,7 +1275,7 @@

htmLawed documentation


  Note: In the example above, unlike <*>, <xml> gets considered as a tag (even though there is no HTML element named xml). Thus, the keep_bad parameter's value affects <xml> but not <*>. In general, text matching the regular expression pattern <(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?> is considered a tag (phrase enclosed by the angled brackets < and >, and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...), and is subjected to the keep_bad value.

-  Nesting/content rules for each of the 118 elements in htmLawed's default set (see section 3.3) are defined in function hl_bal(). This means that if a non-standard element besides embed is being permitted through $config["elements"], the element's tag content will end up getting removed if $config["balance"] is set to 1.
+  Nesting/content rules for each of the 122 standard elements in htmLawed's default set (see section 3.3) are defined in function hl_balance(). Any custom element (section 3.3.6) is permitted to be within and to contain any other element.

  Plain text and/or certain elements nested inside blockquote, form, map and noscript need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as form, the input B:<input type="text" value="b" />C:<input type="text" value="c" /> is converted to <div>B:<input type="text" value="b" />C:<input type="text" value="c" /></div>.
@@ -1288,34 +1305,57 @@

htmLawed documentation


  As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside pre elements) are all considered equivalent, and referred to as white-spaces. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space normalization allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such pretty HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome.

-  With the $config parameter tidy, htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides pre, the script and textarea elements, CDATA sections, and HTML comments are not subjected to the tidying process.
+  With the $config parameter tidy, htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. The content of pre, script, and textarea elements, CDATA sections, and HTML comments is not subjected to the tidying process.
+
+  Any custom HTML element (section 3.3.6) is treated like an inline element, like strong, during tidying.

  To compact, use $config["tidy"] = -1; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed.

  To beautify, $config["tidy"] is set as 1, or for customized tidying, as a string like 2s2n. The s or t character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The r and n characters are used to specify line-break characters: n for \n (Unix/Mac OS X line-breaks), rn or nr for \r\n (Windows/DOS line-breaks), or r for \r.

+  For instance, with $config["tidy"] set as 3s2n, 3 space characters are used per indentation level, the entire block of text (HTML code) gets a lead (left spacing) of 2 space characters, and line-breaks use the \n character.
+
  The $config["tidy"] value of 1 is equivalent to 2s0n. Other $config["tidy"] values are read loosely: a value of 4 is equivalent to 4s0n; t2, to 1t2n; s, to 2s0n; 2TR, to 2t0r; T1, to 1t1n; nr3, to 3s0nr, and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification.

  Input formatting using $config["tidy"] is not recommended when input text has mixed markup (like HTML + PHP).
+
+

+3.3.6  Custom HTML elements +

(to top)
+
+  Custom elements are HTML elements whose properties/behaviors are defined by the author, instead of being universal (i.e., defined by the HTML interpreter like a browser). Their names must begin with a lowercased a-z character, contain at least one hyphen (-), and cannot be: annotation-xml, color-profile, font-face, font-face-src, font-face-uri, font-face-format, font-face-name, missing-glyph. A huge variety of characters is permitted in the name.
+
+ +    0-9 | . | _ | #xB7 | #xC0-#xD6 | #xD8-#xF6 | #xF8-#x37D | #x37F-#x1FFF | #x200C-#x200D | #x203F-#x2040 | #x2070-#x218F | #x2C00-#x2FEF | #x3001-#xD7FF | #xF900-#xFDCF | #xFDF0-#xFFFD | [#x10000-#xEFFFF] +
+
+  With $config["any_custom_element"] set to 0, no custom element is permitted, whereas with a value of 1 (default value), any such element is permitted. Regardless of the setting, specific custom elements can be denied or permitted through $config["elements"] (see section 3.3.1).
+
+  Any custom HTML element is treated like an inline element, like strong, during tidying (section 3.3.5). During tag balancing (section 3.3.3), any custom element is permitted to be within and to contain any other element. These laxities are necessitated because, by definition, custom elements are parochial.
+
+  Unless denied through $spec, custom elements are permitted to have attributes of any name, as long as the name does not contain one of a few disallowed characters (such as the equal sign, the forward slash, and most control characters) and satisfies any data attribute name requirement.
+

3.4  Attributes

(to top)

-  In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in section 5.2. Using the $spec argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (section 2.3).
+  In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in section 5.2. Using the $spec argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (section 2.3).

  Custom data-* (data-star) attributes, where the first three characters of the value of star (*) after lower-casing do not equal xml, and the value of star does not have a colon (:), equal-to (=), newline, solidus (/), space or tab character, or any upper-case A-Z character are allowed in all elements. ARIA, event and microdata attributes like aria-live, onclick and itemid are also considered global attributes (section 5.2).

-  When $config["deny_attribute"] is not set, or set to 0, or empty (""), all attributes are permitted. Otherwise, $config["deny_attribute"] can be set as a list of comma-separated names of the denied attributes. on* can be used to refer to the group of potentially dangerous, script-accepting event attributes like onblur and onchange that have on at the beginning of their names. Similarly, aria* and data* can be used to respectively refer to the set of all ARIA and data-* attributes.
+  When $config["deny_attribute"] is not set, or set to 0, or empty (""), all attributes are permitted as per standards. Otherwise, $config["deny_attribute"] can be set in two different ways. One way is as a list of comma-separated names of the denied attributes. on* can be used to refer to the group of potentially dangerous, script-accepting event attributes like onchange that have on at the beginning of their names. Similarly, aria* and data* can be used to respectively refer to the set of all ARIA and data-* attributes. The second way to set $config["deny_attribute"] permits the denying of all but a few attributes globally. The notation is * -attribute1 -attribute2 .... Thus, a value of * -title -href implies that except href and title (where allowed as per standards) all other attributes are to be removed. Terms aria*, data*, and on* can be used in this notation, and a whitespace character is necessary before the - character.

-  With $config["safe"] = 1 (section 3.6), the on* event attributes are automatically disallowed even if a value for $config["deny_attribute"] has been manually provided.
+  With $config["safe"] = 1 (section 3.6), any on* event attribute is disallowed even if $config["deny_attribute"] is set otherwise (such as * -style -on*).

-  Note that attributes specified in $config["deny_attribute"] are denied globally, for all elements. To deny attributes for only specific elements, $spec (see section 2.3) can be used. $spec can also be used to element-specifically permit an attribute otherwise denied through $config["deny_attribute"].
+  The attribute restrictions specified with $config["deny_attribute"] apply to all elements. To deny attributes for only specific elements, $spec (see section 2.3) can be used. $spec can also be used to element-specifically permit an attribute otherwise denied through $config["deny_attribute"].

-  Finer restrictions on attributes can also be put into effect through $config["deny_attribute"] (section).
+  Finer restrictions on attributes can also be put into effect through $config["hook_tag"] (section 3.4.9).

Note: To deny all but a few attributes globally, a simpler way to specify $config["deny_attribute"] would be to use the notation * -attribute1 -attribute2 .... Thus, a value of * -title -href implies that except href and title (where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter safe (section 3.6) will have no effect on deny_attribute. Values of aria* data*, and on* cannot be used in this notation to refer to the sets of all ARIA, data-*, and on* attributes respectively.
+  Unless denied through $spec, custom elements are permitted to have attributes of any name, as long as the name does not contain one of a few disallowed characters (such as the equal sign, the forward slash, and most control characters) and satisfies any data attribute name requirement.

  htmLawed (function hl_tag()) also:

@@ -1323,6 +1363,7 @@

htmLawed documentation

  *  Removes duplicate attributes (last one stays)
  *  Gives attributes the form name="value" and single-spaces them, removing unnecessary white-spacing
  *  Provides required attributes (see section 3.4.1)
+  *  Optionally lowercases certain standard attribute values (see section 3.4.5)
  *  Double-quotes values and escapes any " inside them
  *  Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point ad) in the values with spaces
  *  Allows custom function to additionally filter/modify attribute values (see section 3.4.9)
@@ -1380,11 +1421,13 @@

htmLawed documentation

  Also, only data, file, http, https and javascript are permitted in these attributes that accept URLs:

-    action, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, src, srcset, style, usemap, and event attributes like onclick +    action, archive, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, poster, src, srcset, style, usemap, and event attributes like onclick

  With $config["safe"] = 1 (section 3.6), the above is changed to disallow app, data and javascript.

Note: URLs in data-* attribute values are not checked, but $spec (section 2.3) or $config["hook_tag"] (section 3.4.9) can be used for this purpose.
+
  These default sets are used when $config["schemes"] is not set (see section 2.2). To over-ride the defaults, $config["schemes"] is defined as a string of semi-colon-separated sub-strings of type attribute: comma-separated schemes. E.g., href: mailto, http, https; onclick: javascript; src: http, https. For unspecified attributes, data, file, http, https and javascript are permitted. This can be changed by passing schemes for * in $config["schemes"]. E.g., href: mailto, http, https; *: http, https.

  * (asterisk) can be put in the list of schemes to permit all protocols. E.g., style: *; img: http, https results in protocols not being checked in style attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section 3.4.4) is not done. When an attribute is explicitly listed in $config["schemes"], then filtering is dictated by the setting for the attribute, with no effect of the setting for asterisk. That is, the set of attributes that asterisk refers to no longer includes the listed attribute.
@@ -1395,8 +1438,6 @@

htmLawed documentation


  ! can be put in the list of schemes to disallow all protocols as well as local URLs. Thus, with href: http, style: !, <a href="http://cnn.com" style="background-image: url(local.jpg);">CNN</a> will become <a href="http://cnn.com" style="background-image: url(denied:local.jpg);">CNN</a>

Note: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string src (e.g., dynsrc) or starts with o (e.g., onbeforecopy).
-
  With $config["safe"] = 1, all URLs are disallowed in the style attribute values.
@@ -1606,11 +1647,9 @@

htmLawed documentation

3.4.9  Hook function for tag content (to top)

-  It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).
+  It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.). The function should have two arguments, the first receiving an element name and the second receiving either 0 (in case of a closing tag) or an array of attribute name-value pairs (opening tag). It should return a string with full HTML markup, either an opening or a closing tag with element name and any string of attributes.

-  When $config parameter hook_tag is set to the name of a function, htmLawed (function hl_tag()) will pass on the element name, and the finalized attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like <element_name attribute_1_name="attribute_1_value"...> (for empty elements like img and input, the element-closing slash / should also be included), etc.
-
-  Any hook_tag function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as a in the closing </a> tag of the element <a href="http://cnn.com">CNN</a>. No other value is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like </a>).
+  When $config parameter hook_tag is set to the name of a function or class method, htmLawed (function hl_tag()) will pass on the element name, and the finalized attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like <element_name attribute_1_name="attribute_1_value"...> (for empty elements like img and input, the element-closing slash / should also be included), etc.

  This is a powerful functionality that can be exploited for various objectives: consolidate-and-convert inline style attributes to class, convert embed elements to object, permit only one caption element in a table element, disallow embedding of certain types of media, inject HTML, use CSSTidy to sanitize style attribute values, etc.

@@ -1705,7 +1744,7 @@

htmLawed documentation


  The hook_tag parameter is different from the hook parameter (section 3.7).

-  Snippets of hook function code developed by others may be available on the htmLawed website.
+  Snippets of hook function code developed by others may be available on the htmLawed website.

@@ -1733,13 +1772,13 @@

htmLawed documentation

    deny_attribute - on*
-    elements - * -applet -audio -canvas -embed -iframe -object -script -video +    elements - * -applet -audio -canvas -dialog -embed -iframe -object -script -video
    schemes - href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style: !; *:file, http, https

-  With safe set to 1, htmLawed considers CDATA sections and HTML comments as plain text, and prohibits the applet, audio, canvas, embed, iframe, object, script and video elements, and the on* attributes like onclick. ( There are $config parameters like css_expression that are not affected by the value set for safe but whose default values still contribute towards a more safe output.) Further, unless overridden by the value for parameter schemes (see section 3.4.3), the schemes app, data and javascript are not permitted, and URLs with schemes are neutralized so that, e.g., style="moz-binding:url(http://danger)" becomes style="moz-binding:url(denied:http://danger)".
+  With safe set to 1, htmLawed considers CDATA sections and HTML comments as plain text, and prohibits the applet, audio, canvas, dialog, embed, iframe, object, script and video elements, and the on* attributes like onclick. (There are $config parameters like css_expression that are not affected by the value set for safe but whose default values still contribute towards a safer output.) Further, unless overridden by the value for parameter schemes (see section 3.4.3), the schemes app, data and javascript are not permitted, and URLs with schemes are neutralized so that, e.g., style="moz-binding:url(http://danger)" becomes style="moz-binding:url(denied:http://danger)".

  Admins, however, may still want to completely deny the style attribute, e.g., with code like

@@ -1751,27 +1790,27 @@

htmLawed documentation


  If a value for a parameter auto-set through safe is still manually provided, then that value can over-ride the auto-set value. E.g., with $config["safe"] = 1 and $config["elements"] = "* +script", script, but not applet, is allowed. Such over-ride does not occur for deny_attribute (for legacy reason) when comma-separated attribute names are provided as the value for this parameter (section 3.4); instead htmLawed will add on* to the value provided for deny_attribute.

-  A page illustrating the efficacy of htmLawed's anti-XSS abilities with safe set to 1 against XSS vectors listed by RSnake may be available here.
+  A page illustrating the efficacy of htmLawed's anti-XSS abilities with safe set to 1 against XSS vectors listed by RSnake may be available here.

3.7  Using a hook function

(to top)

-  If $config["hook"] is not set to 0, then htmLawed will allow preliminarily processed input to be altered by a hook function named by $config["hook"] before starting the main work (but after handling of characters, entities, HTML comments and CDATA sections -- see code for function htmLawed()).
+  If $config["hook"] is not set to 0, then htmLawed will allow preliminarily processed input to be altered by a function or class method named by $config["hook"] before starting the main work (but after handling of characters, entities, HTML comments and CDATA sections -- see code for function htmLawed()). The function should have three arguments – the processed input string, and the finalized $config and $spec arrays, in order – and it should return the string after any manipulation.

  The hook function also allows one to alter the finalized values of $config and $spec.

  Note that the hook parameter is different from the hook_tag parameter (section 3.4.9).

-  Snippets of hook function code developed by others may be available on the htmLawed website.
+  Snippets of hook function code developed by others may be available on the htmLawed website.

3.8  Obtaining finalized parameter values

(to top)

-  htmLawed can assign the finalized $config and $spec values to a variable named by $config["show_setting"]. The variable, made global by htmLawed, is set as an array with three keys: config, with the $config value, spec, with the $spec value, and time, with a value that is the Unix time (the output of PHP's microtime() function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code.
+  htmLawed can assign the finalized $config and $spec values to a variable named by $config["show_setting"]. The variable, made global by htmLawed, is set as an array with four keys: config, with the $config value, spec, with the $spec value, time, with a value that is the Unix time (the output of PHP's microtime function) when htmLawed completed filtering, and version, with htmLawed version. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code.

  The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers.
@@ -1809,7 +1848,7 @@

htmLawed documentation

4.1  Support (to top)

-  Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net.
+  Software updates and forum-based community-support may be found at https://bioinformatics.org/phplabware/internal_utilities/htmLawed.

@@ -1827,6 +1866,20 @@

htmLawed documentation


  Version number - Release date. Notes

+  1.2.11 - 23 January 2023. Fixes an XSS vulnerability arising from a lack of inspection for the alphabetical HTML entity for the colon character in URLs.
+
+  1.2.10 - 5 November 2022. Class methods can now be specified as $config hook and hook_tag functions; corrects a PHP notice if $config["schemes"] mistakenly lacks colons.
+
+  1.2.9 - 2 July 2022. Improves parsing of $config["deny_attribute"] to permit spaces flanking comma characters and allow references to sets of all ARIA, data-* and event attributes; fixes parsing of $spec for data-* attribute rules; now permits use of aria*, data*, and on* in $spec; now covers all named HTML entities of current standard specification (this increased htmLawed code size by ~40%); recognizes that closing tag may be omitted for caption, optgroup, rp, rt, and tbody as well; recognizes that archive and poster attribute values can have URLs, which can be multiple; recognizes onloadend as global attribute; renames some internal functions; improved standards-compliance for element nesting.
+
+  1.2.8 - 6 June 2022. Fixes incorrect formatting of HTML comments when $config["comment"] = 4; fixes misreading of entity-fied colon characters in style attribute values; $config["show_setting"] now includes htmLawed version; improved PHP 8.2 code compatibility, and readability
+
+  1.2.7 - 10 April 2022. Support for elements dialog, picture, slot, and template; support for custom HTML elements; support for global attributes autocapitalize, autofocus, enterkeyhint, inputmode, is, and nonce; support for 17 additional ARIA and 11 additional on* event handler attributes; support for attributes with names not beginning with a-z; fix for a minor bug arising during deprecated height/width attribute transformation
+
+  1.2.6 - 4 September 2021. Fixes a bug that arises when $config["deny_attribute"] has a data-* attribute with > 1 hyphen character
+
+  1.2.5 - 24 September 2019. Fixes two bugs in font tag transformation
+
  1.2.4.2 - 16 May 2019. Corrects a PHP notice if a semi-colon is present in $config["schemes"]

  1.2.4.1 - 12 September 2017. Corrects a function re-declaration bug introduced in version 1.2.4
@@ -1911,7 +1964,7 @@

htmLawed documentation


  1.0.2 - 13 February 2008. Improved implementation of $config["keep_bad"]

-  1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions (hl_tag() and hl_prot()); no error display with hl_regex()
+  1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions; no error display during regex testing

  1.0 - 2 November 2007. First release
@@ -1937,14 +1990,14 @@

htmLawed documentation


  (3) From version older than 1.2 to later, if htmLawed is used without $config["safe"] set to 1: Unlike previous versions, htmLawed version 1.2 and later permit data and javascript URL schemes by default (see section 3.4.3).

-  Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip; for 1.1.1, http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip; and for 1.1.22, http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip.
+  Old versions of htmLawed may be available online. E.g., for version 1.0, check https://bioinformatics.org/phplabware/downloads/htmLawed1.zip; for 1.1.1, https://bioinformatics.org/phplabware/downloads/htmLawed111.zip; and for 1.1.22, https://bioinformatics.org/phplabware/downloads/htmLawed1122.zip.

4.6  Comparison with HTMLPurifier

(to top)

-  The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2015):
+  The HTMLPurifier PHP library by Edward Yang is a good HTML filtering script that uses object-oriented PHP code. Compared to htmLawed, as of year 2015, HTMLPurifier:

  *  does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)

@@ -1952,27 +2005,29 @@

htmLawed documentation


  *  consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory)

-  *  is expectedly slower
+  *  is, as expected, considerably slower

  *  lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification)

  *  has poor documentation

-  However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier website for updated information.
+  *  may have finer checks for character encodings and attribute values
+
+  *  can log warnings and errors

4.7  Use through application plug-ins/modules

(to top)

-  Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed forum.
+  Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed forum.

4.8  Use in non-PHP applications

(to top)

-  Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site.
+  Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site.

@@ -1986,7 +2041,7 @@

htmLawed documentation

4.10  Acknowledgements (to top)

-  Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Alexandre Chouinard, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, Edward Yang, and many anonymous users.
+  Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Michael Butler, Dac Chartrand, Alexandre Chouinard, NinCollin, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Hideki Mitsuda, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, walrusmoose, Edward Yang, and many others.

  Thank you!
@@ -2008,11 +2063,11 @@

htmLawed documentation

5.2  Valid attribute-element combinations (to top)

-  *  includes deprecated attributes (marked ^), attributes for microdata (marked *), the non-standard bordercolor, and new-in-HTML5 attributes (marked ~); can have multiple comma-separated values (marked %); can have multiple space-separated values (marked $)
+  *  includes deprecated attributes (marked ^), attributes for microdata (marked *), some non-standard attributes for embed (marked **), and the non-standard bordercolor; can have multiple comma-separated values (marked %); can have multiple space-separated values (marked $)
  *  only non-frameset, HTML body elements
  *  name for a and map, and lang are invalid in XHTML 1.1
-  *  target is valid for a in XHTML 1.1 and higher
  *  xml:space is only for XHTML 1.1
+  *  excludes data-* and author-specified, non-standard attributes of custom elements

  abbr - td, th
  accept - form, input
@@ -2022,17 +2077,17 @@

htmLawed documentation

  allowfullscreen - iframe
  alt - applet, area, img, input
  archive - applet, object
-  async~ - script
-  autocomplete~ - input
-  autofocus~ - button, input, keygen, select, textarea
-  autoplay~ - audio, video
+  async - script
+  autocomplete - input
+  autofocus - button, input, keygen, select, textarea
+  autoplay - audio, video
  axis - td, th
  bgcolor - embed, table^, td^, th^, tr^
  border - img, object^, table
  bordercolor - table, td, tr
  cellpadding - table
  cellspacing - table
-  challenge~ - keygen
+  challenge - keygen
  char - col, colgroup, tbody, td, tfoot, th, thead, tr
  charoff - col, colgroup, tbody, td, tfoot, th, thead, tr
  charset - a, script
@@ -2048,94 +2103,94 @@

htmLawed documentation

  colspan - td, th
  compact - dir, dl^, menu, ol^, ul^
  content - meta
-  controls~ - audio, video
+  controls - audio, video
  coords - area, a
-  crossorigin~ - img
+  crossorigin - img
  data - object
  datetime - del, ins, time
  declare - object
-  default~ - track
+  default - track
  defer - script
  dir - bdo
-  dirname~ - input, textarea
+  dirname - input, textarea
  disabled - button, command, fieldset, input, keygen, optgroup, option, select, textarea
-  download~ - a
+  download - a
  enctype - form
  face - font
  flashvars** - embed
  for - label, output
-  form~ - button, fieldset, input, keygen, label, object, output, select, textarea
-  formaction~ - button, input
-  formenctype~ - button, input
-  formmethod~ - button, input
-  formnovalidate~ - button, input
-  formtarget~ - button, input
+  form - button, fieldset, input, keygen, label, object, output, select, textarea
+  formaction - button, input
+  formenctype - button, input
+  formmethod - button, input
+  formnovalidate - button, input
+  formtarget - button, input
  frame - table
  frameborder - iframe
  headers - td, th
  height - applet, canvas, embed, iframe, img, input, object, td^, th^, video
-  high~ - meter
+  high - meter
  href - a, area, link
  hreflang - a, area, link
  hspace - applet, embed, img^, object^
-  icon~ - command
+  icon - command
  ismap - img, input
-  keytype~ - keygen
-  keyparams~ - keygen
-  kind~ - track
+  keytype - keygen
+  keyparams - keygen
+  kind - track
  label - command, menu, option, optgroup, track
  language - script^
-  list~ - input
+  list - input
  longdesc - img, iframe
-  loop~ - audio, video
-  low~ - meter
+  loop - audio, video
+  low - meter
  marginheight - iframe
  marginwidth - iframe
-  max~ - input, meter, progress
+  max - input, meter, progress
  maxlength - input, textarea
-  media~ - a, area, link, source, style
-  mediagroup~ - audio, video
+  media - a, area, link, source, style
+  mediagroup - audio, video
  method - form
-  min~ - input, meter
+  min - input, meter
  model** - embed
  multiple - input, select
-  muted~ - audio, video
-  name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, keygen, map^, object, output, param, select, textarea
+  muted - audio, video
+  name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, keygen, map^, object, output, param, select, slot, textarea
  nohref - area
  noshade - hr^
-  novalidate~ - form
+  novalidate - form
  nowrap - td^, th^
  object - applet
-  open~ - details
-  optimum~ - meter
-  pattern~ - input
-  ping~ - a, area
-  placeholder~ - input, textarea
+  open - details, dialog
+  optimum - meter
+  pattern - input
+  ping - a, area
+  placeholder - input, textarea
  pluginspage** - embed
  pluginurl** - embed
-  poster~ - video
-  pqg~ - keygen
-  preload~ - audio, video
+  poster - video
+  pqg - keygen
+  preload - audio, video
  prompt - isindex
-  pubdate~ - time
+  pubdate - time
  radiogroup* - command
  readonly - input, textarea
-  required~ - input, select, textarea
+  required - input, select, textarea
  rel$ - a, area, link
  rev - a
-  reversed~ - old
+  reversed - ol
  rows - textarea
  rowspan - td, th
  rules - table
-  sandbox~ - iframe
+  sandbox - iframe
  scope - td, th
-  scoped~ - style
+  scoped - style
  scrolling - iframe
-  seamless~ - iframe
+  seamless - iframe
  selected - option
  shape - area, a
  size - font, hr^, input, select
-  sizes~ - link
+  sizes - link
  span - col, colgroup
  src - audio, embed, iframe, img, input, script, source, track, video
  srcdoc - iframe
@@ -2159,7 +2214,7 @@

htmLawed documentation


  The following attributes, including event-specific ones and attributes of ARIA and microdata specifications, are considered global and allowed in all elements:

-  accesskey, aria-activedescendant, aria-atomic, aria-autocomplete, aria-busy, aria-checked, aria-controls, aria-describedby, aria-disabled, aria-dropeffect, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-label, aria-labelledby, aria-level, aria-live, aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-posinset, aria-pressed, aria-readonly, aria-relevant, aria-required, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class$, contenteditable, contextmenu, dir, draggable, dropzone, hidden, id, inert, itemid, itemprop, itemref, itemscope, itemtype, lang, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, role, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space
+  accesskey, autocapitalize, autofocus, aria-activedescendant, aria-atomic, aria-autocomplete, aria-braillelabel, aria-brailleroledescription, aria-busy, aria-checked, aria-colcount, aria-colindex, aria-colindextext, aria-colspan, aria-controls, aria-current, aria-describedby, aria-description, aria-details, aria-disabled, aria-dropeffect, aria-errormessage, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-keyshortcuts, aria-label, aria-labelledby, aria-level, aria-live, aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-placeholder, aria-posinset, aria-pressed, aria-readonly, aria-relevant, aria-required, aria-roledescription, aria-rowcount, aria-rowindex, aria-rowindextext, aria-rowspan, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class, contenteditable, contextmenu, dir, draggable, dropzone, enterkeyhint, hidden, id, inert, inputmode, is, itemid, itemprop, itemref, itemscope, itemtype, lang, nonce, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadend, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, onauxclick, oncancel, onclose, 
oncontextlost, oncontextrestored, onformdata, onmouseenter, onmouseleave, onresize, onsecuritypolicyviolation, onslotchange, role, slot, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space

  Custom data-* attributes, where the first three characters of the value of star (*) after lower-casing do not equal xml and the value of star does not have a colon (:), equal-to (=), newline, solidus (/), space, tab, or any A-Z character, are also considered global and allowed in all elements.
@@ -2257,22 +2312,22 @@

htmLawed documentation


  Except for the main htmLawed() function, htmLawed's functions are name-spaced using the hl_ prefix. The functions and their roles are:

-  *  hl_attrval - check attribute values against $spec
-  *  hl_bal - balance tags and ensure proper nesting
-  *  hl_cmtcd - handle CDATA sections and HTML comments
-  *  hl_ent - handle character entities
-  *  hl_prot - check a URL scheme/protocol
+  *  hl_attributeValue - check attribute values against $spec rules
+  *  hl_balance - balance tags and ensure proper nesting
+  *  hl_commentCdata - handle CDATA sections and HTML comments
+  *  hl_deprecatedElement - transform element tags
+  *  hl_entity - handle character entities
  *  hl_regex - check syntax of a regular expression
-  *  hl_spec - convert user-supplied $spec value to one used internally
+  *  hl_spec - convert $spec value to one used internally
  *  hl_tag - handle element tags and attributes
-  *  hl_tag2 - transform element tags
  *  hl_tidy - compact/beautify HTML
+  *  hl_url - check URL-containing values
  *  hl_version - report htmLawed version
  *  htmLawed - main function

htmLawed() finalizes $spec (with the help of hl_spec()) and $config, and globalizes them. Finalization of $config involves setting default values if an inappropriate or invalid one is supplied. This includes calling hl_regex() to check well-formedness of regular expression patterns if such expressions are user-supplied through $config. htmLawed() then removes invalid characters like nulls and x01 and appropriately handles entities using hl_ent(). HTML comments and CDATA sections are identified and treated as per $config with the help of hl_cmtcd(). When retained, the < and > characters identifying them, and the <, > and & characters inside them, are replaced with control characters (code-points 1 to 5) till any tag balancing is completed.
htmLawed() finalizes $spec (with the help of hl_spec()) and $config, and globalizes them. Finalization of $config involves setting default values if an inappropriate or invalid one is supplied. This includes calling hl_regex() to check well-formedness of regular expression patterns if such expressions are user-supplied through $config. htmLawed() then removes invalid characters like nulls and x01 and appropriately handles entities using hl_entity(). HTML comments and CDATA sections are identified and treated as per $config with the help of hl_commentCdata(). When retained, the < and > characters identifying them, and the <, > and & characters inside them, are replaced with control characters (code-points 1 to 5) till any tag balancing is completed.

-  After this initial processing htmLawed() identifies tags using regex and processes them with the help of hl_tag() --  a large function that analyzes tag content, filtering it as per HTML standards, $config and $spec. Among other things, hl_tag() transforms deprecated elements using hl_tag2(), removes attributes from closing tags, checks attribute values as per $spec rules using hl_attrval(), and checks URL protocols using hl_prot(). htmLawed() performs tag balancing and nesting checks with a call to hl_bal(), and optionally compacts/beautifies the output with proper white-spacing with a call to hl_tidy(). The latter temporarily replaces white-space, and <, > and & characters inside pre, script and textarea elements, and HTML comments and CDATA sections with control characters (code-points 1 to 5, and 7).
+  After this initial processing htmLawed() identifies tags using regex and processes them with the help of hl_tag() --  a large function that analyzes tag content, filtering it as per HTML standards, $config and $spec. Among other things, hl_tag() transforms deprecated elements using hl_deprecatedElement(), removes attributes from closing tags, checks attribute values as per $spec rules using hl_attributeValue(), and checks URL protocols using hl_url(). htmLawed() performs tag balancing and nesting checks with a call to hl_balance(), and optionally compacts/beautifies the output with proper white-spacing with a call to hl_tidy(). The latter temporarily replaces white-space, and <, > and & characters inside pre, script and textarea elements, and HTML comments and CDATA sections with control characters (code-points 1 to 5, and 7).

  htmLawed permits the use of custom code or hook functions at two stages. The first, called inside htmLawed(), allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see section 3.7). The second is called by hl_tag() once the tag content is finalized (see section 3.4.9).

@@ -2280,8 +2335,8 @@

htmLawed documentation


-


HTM version of htmLawed_README.txt generated on 16 May, 2019 using rTxt2htm from PHP Labware +


HTM version of htmLawed_README.txt generated on 23 Jan, 2023 using rTxt2htm from PHP Labware - + \ No newline at end of file diff --git a/htmLawed_README.txt b/htmLawed_README.txt index 88df4d3..b35e471 100755 --- a/htmLawed_README.txt +++ b/htmLawed_README.txt @@ -1,9 +1,9 @@ /* -htmLawed_README.txt, 16 May 2019 -htmLawed 1.2.4.2, 16 May 2019 +htmLawed_README.txt, 23 January 2023 +htmLawed 1.2.11 Copyright Santosh Patnaik Dual licensed with LGPL 3 and GPL 2+ -A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed +A PHP Labware internal utility - https://bioinformatics.org/phplabware/internal_utilities/htmLawed */ @@ -36,6 +36,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern 3.3.3 Tag balancing & proper nesting 3.3.4 Elements requiring child elements 3.3.5 Beautify or compact HTML + 3.3.6 Custom elements 3.4 Attributes 3.4.1 Auto-addition of XHTML-required attributes 3.4.2 Duplicate/invalid 'id' values @@ -75,7 +76,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and with administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that introduces a security vulnerability or is used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such `lawing in` of HTML code ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators. - + htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file. It does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML Tidy:- http://tidy.sourceforge.net application. 
@@ -85,13 +86,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Filtering of text submitted as comments on blogs to allow only certain HTML elements * Making RSS newsfeed items standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant - + * Beautifying or pretty-printing HTML code * Text processing for stricter XML standard-compliance: e.g., to have lowercased 'x' in hexadecimal numeric entities becomes necessary if an HTML document with MathML content needs to be served as 'application/xml' * Scraping text from web-pages - + * Transforming an HTML element to another @@ -99,11 +100,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Key: '*' security feature, '^' standard compliance, '~' requires setting right options - + htmLawed: - + * makes input more *secure* and *standard-compliant* for HTML as well as generic *XML* documents ^ - * supports markup for *HTML 5* and *microdata, ARIA, Ruby, custom attributes*, etc. ^ + * supports markup for *HTML 5*, *custom elements*, and *microdata, ARIA, Ruby, custom attributes*, etc. ^ * can *beautify* or *compact* HTML ~ * works with input of almost any *character encoding* and does not affect it * has good *tolerance for ill-written HTML* @@ -163,7 +164,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern htmLawed was created in 2007 for use with 'LabWiki', a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like 'HTML Tidy'. 
The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the 'Kses' (version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses 'Kses'; see section:- #2.6.). Support for HTML version 5 was added in May 2013 in a beta and in February 2017 in a production release. - + See section:- #4.3 for a detailed log of changes in htmLawed over the years, and section:- #4.10 for acknowledgements. @@ -206,14 +207,14 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 1.6 Availability -----------------------------------------------o - htmLawed can be downloaded for free at its website:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. Besides the 'htmLawed.php' file, the download has the htmLawed documentation (this document) in plain text:- htmLawed_README.txt and HTML:- htmLawed_README.htm formats, a script for testing:- htmLawedTest.php, and a text file for test-cases:- htmLawed_TESTCASE.txt. htmLawed is also available as a PHP class (OOP code) at its website. + htmLawed can be downloaded for free at its website:- https://bioinformatics.org/phplabware/internal_utilities/htmLawed. Besides the 'htmLawed.php' file, the download has the htmLawed documentation (this document) in plain text:- htmLawed_README.txt and HTML:- htmLawed_README.htm formats, a script for testing:- htmLawedTest.php, and a text file for test-cases:- htmLawed_TESTCASE.txt. htmLawed can be installed with Composer, and is also available as a PHP class (OOP code) – see the website:- https://bioinformatics.org/phplabware/internal_utilities/htmLawed. Official htmLawed releases are also put up on Sourceforge:- https://sourceforge.net/projects/htmlawed/. == 2 Usage =======================================================oo htmLawed works in PHP version 4.4 or higher. Either 'include()' the 'htmLawed.php' file, or copy-paste the entire code. 
- + To use with PHP 4.3, have the following code included: if(!function_exists('ctype_digit')){ @@ -229,16 +230,16 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern The input text to be processed, '$text', is passed as an argument of type string; 'htmLawed()' returns the processed string: $processed = htmLawed($text); - + With the 'htmLawed class' (section:- #1.6), usage is: - + $processed = htmLawed::hl($text); *Notes*: (1) If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes'. By default, htmLawed will process the text allowing all valid HTML elements/tags and commonly used URL schemes and CSS style properties. It will allow Javascript code, 'CDATA' sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- '$config' and '$spec': - $processed = htmLawed($text, $config, $spec); + $processed = htmLawed($text, $config, $spec); The '$config' and '$spec' arguments are detailed below. Some examples are shown in section:- #2.9. For maximum protection against 'XSS' and other security vulnerabilities, consider using the 'safe' parameter; see section:- #3.6. @@ -267,7 +268,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *anti_link_spam* Anti-link-spam measure; see section:- #3.4.7 - + '0' - no measure taken * `array("regex1", "regex2")` - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more. 
@@ -277,6 +278,12 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern '0' - no measure taken * `word` - '@' in mail address in 'href' attribute value is replaced with specified `word` + *any_custom_element* + Permit any custom element; regardless of this setting, specific custom elements can be denied or permitted through '$config["elements"]'; see section:- #3.3.6 + + '0' - no + '1' - yes * + *balance* Balance tags for well-formedness and proper nesting; see section:- #3.3.3 @@ -321,7 +328,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern '0' - none * `string` - dictated by values in `string` 'on*' - on* event attributes like 'onfocus' not allowed " - + *direct_nest_list* Allow direct nesting of a list within another without requiring it to be a list item; see section:- #3.3.4 @@ -333,7 +340,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern `all` - *^ '* -acronym -big -center -dir -font -isindex -s -strike -tt' - ~^ - `applet, audio, canvas, embed, iframe, object, script, and video elements not allowed` - "^ + `applet, audio, canvas, dialog, embed, iframe, object, script, and video elements not allowed` - "^ *hexdec_entity* Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see section:- #3.2 @@ -357,7 +364,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *keep_bad* Neutralize `bad` tags by converting their '<' and '>' characters to entities, or remove them; see section:- #3.3.3 - '0' - remove + '0' - remove '1' - neutralize both tags and element content '2' - remove tags but neutralize element content '3' and '4' - like '1' and '2' but remove if text ('pcdata') is invalid in parent element @@ -372,7 +379,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *make_tag_strict* Transform or remove these 
deprecated HTML elements, even if they are allowed by the admin: acronym, applet, big, center, dir, font, isindex, s, strike, tt; see section:- #3.3.2 - '0' - no + '0' - no '1' - yes, but leave 'applet' and 'isindex' that currently cannot be transformed *^ '2' - yes, removing 'applet' and 'isindex' elements and their contents (nested elements remain) ~^ @@ -385,7 +392,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *no_deprecated_attr* Allow deprecated attributes or transform them; see section:- #3.4.6 - '0' - allow + '0' - allow '1' - transform, but 'name' attributes for 'a' and 'map' are retained * '2' - transform @@ -423,7 +430,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *unique_ids* 'id' attribute value checks; see section:- #3.4.2 - '0' - no + '0' - no '1' - remove duplicate and/or invalid ones * `word` - remove invalid ones and replace duplicate ones with new and unique ones based on the `word`; the admin-specified `word` cannot contain a space character @@ -455,7 +462,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern A rule begins with an HTML *element* name(s) (`rule-element`), for which the rule applies, followed by an equal-to (=) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., 'th,td,tr='. - Rest of the rule consists of comma-separated HTML *attribute names*. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., '-width'. To deny all attributes, '-*' can be used. + Rest of the rule consists of comma-separated HTML *attribute names*, which can be the wildcard references '*', 'aria*', 'data*', and 'on*' for the sets of all standard, Aria, data-*, and event (on*) attributes, respectively. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., '-width'. 
To deny all attributes, '-*' can be used. All Aria, data-*, and event (on*) attributes can similarly be denied using '-aria*', '-data*', and '-on*', respectively. Following shows examples of rule excerpts with rule-element 'a' and the attributes that are being permitted: @@ -466,8 +473,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * 'a=-*' - none * 'a=-*, href, title' - none except 'href' and 'title' * 'a=-*, -id, href, title' - none except 'href' and 'title' + * 'a=-on*, -id, href, onclick, title' - all except 'id' and on* other than 'onclick' - Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in solidus (/)-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None or one or more of the following parameters may be specified: + Rules regarding *attribute values* are optionally specified inside round brackets after attribute names – which cannot be wildcard references like '*' or 'data*' – in solidus (/)-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None or one or more of the following parameters may be specified: * 'oneof' - one or more choices separated by '|' that the value should match; if only one choice is provided, then the value must match that choice; matching is case-sensitive @@ -499,14 +507,16 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *Special characters*: The characters ';', ',', '/', '(', ')', '|', '~' and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be `escaped` by enclosing in pairs of double-quotes ('"'). A back-tick ('`') can be used to escape a literal '"'. An example rule illustrating this is 'input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")'.
- *Attributes that accept multiple values*: If an attribute is 'accesskey', 'class', 'itemtype' or 'rel', which can have multiple, space-separated values, or 'srcset', which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them. - + *Attributes that accept multiple values*: If an attribute is 'accesskey', 'class', 'itemtype' or 'rel', or 'archive' in case of 'object' element, which can have multiple, space-separated values, or 'archive' in case of 'object' element and 'srcset', which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them. The parsing is performed after any URL assessment of the attribute values (section:- #3.4.3). + *Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be possibly used to implement a functionality like that achieved using '$spec' functionality. + + *Note*: Attributes permitted through '$spec' are permitted regardless of any denial through '$config'. An attribute for which $spec indicates both permission and denial will be permitted. E.g., 'onclick' with '$spec' value of 'a = *, -onclick, onclick', 'a = -on*, onclick' or 'a = on*, -onclick' will be permitted inside 'a'. - *Note*: Attributes' specifications for an element may be set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence. - - '$spec' can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. 
Thus, the following value of '$spec' will permit the custom uses of the standard 'rel' attribute in 'input' (not permitted as per standards) and of a non-standard attribute, 'vFlag', in 'img'. - + *Note*: Attributes' specifications for an element may be (inadvertently) set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence. + + '$spec' can also be used to permit *custom or non-standard attributes*. Thus, the following value of '$spec' will permit the custom uses of the standard 'rel' attribute in 'input' (not permitted as per standards) and of a non-standard attribute, 'vFlag', in 'img'. + $spec = 'img=vFlag; input=rel' The attribute names must begin with an alphabet and cannot have space, equal-to (=) and solidus (/) characters. @@ -525,20 +535,20 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. 
For example, following increase security risks: - * Allowing 'script', 'applet', 'embed', 'iframe', 'canvas', 'audio', 'video' or 'object' elements, or certain of their attributes like 'allowscriptaccess' + * Allowing 'script', 'applet', 'embed', 'iframe', 'canvas', 'audio', 'video', 'dialog' or 'object' elements, or certain of their attributes like 'allowscriptaccess' * Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., '' - + * Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable) - + * Allowing the 'style' attribute - To remove `unsecure` HTML, code-developers using htmLawed must set '$config' appropriately. E.g., '$config["elements"] = "* -script"' to deny the 'script' element (section:- #3.3), '$config["safe"] = 1' to auto-configure ceratin htmLawed parameters for maximizing security (section:- #3.6), etc. - + To remove `unsecure` HTML, code-developers using htmLawed must set '$config' appropriately. E.g., '$config["elements"] = "* -script"' to deny the 'script' element (section:- #3.3), '$config["safe"] = 1' to auto-configure certain htmLawed parameters for maximizing security (section:- #3.6), etc. + Permitting the '*style*' attribute brings in risks of `click-jacking`, `phishing`, web-page overlays, etc., `even` when the 'safe' parameter is enabled (see section:- #3.6). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing 'style' completely and relying on CSS classes and stylesheet files is recommended. - + htmLawed does not check or correct the character *encoding* of the input it receives.
In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can allow for an exploit (like Google's `UTF-7/XSS` vulnerability of the past). - + Occasionally, though very rarely, the default settings with which htmLawed runs may change between different versions of htmLawed. Admins should keep this in mind when upgrading htmLawed. Important changes in htmLawed's default behavior in new releases of the software are noted in section:- #4.5 on upgrades. @@ -573,7 +583,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern If the 'Kses' code has a non-empty hook function (e.g., 'wp_kses_hook()' in case of 'WordPress'), then the code for htmLawed's 'kses_hook()' function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With 'WordPress', the hook function is an essential one. The following code is suggested for the htmLawed 'kses_hook()' in case of 'WordPress': // kses compatibility - function kses_hook($string, &$cf, &$spec){ + function kses_hook($string, &$cf, &$spec){ $allowed_html = $spec; $allowed_protocols = array(); foreach($cf['schemes'] as $v){ @@ -600,7 +610,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Attribute values may be single- and not double-quoted. - * Left-padding of numeric entities (like, ' ', '&x07ff;') with '0' is okay as long as the number of characters between between the '&' and the ';' does not exceed 8. All entities must end with ';' though. + * Left-padding of numeric entities (like, ' ', '&x07ff;') with '0' is okay as long as the number of characters between the '&' and the ';' does not exceed 8. All entities must end with ';' though. * Named character entities must be properly cased. Thus, '≪' or '&TILDE;' will not be recognized as entities and will be `neutralized`.
@@ -616,7 +626,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc. - * With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when '' is processed into + * With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when '' is processed into ''. * Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant. @@ -632,9 +642,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern htmLawed's main objective is to make the input text `more` standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with possible work-arounds. It should be borne in mind that no browser application is 100% standard-compliant, standard specifications continue to evolve, and many browsers accept commonly used non-standard HTML. Regarding security, note that `unsafe` HTML code is not legally invalid per se. - - * By default, htmLawed will not strictly adhere to the `current` HTML standard. Admins can configure htmLawed to be more strict about standard compliance. Standard specification for HTML is continuously evolving. There are two bodies (W3C:- http://www.w3c.org and WHATWG:- http://www.whatwg.org) that specify the standard and their specifications are not identical. 
E.g., as in mid-2013, the 'border' attribute is valid in 'table' as per W3C but not WHATWG. Thus, htmLawed may not be fully compliant with the standard of a specific group. The HTML standards/rules that htmLawed uses in its logic are a mix of the W3C and WHATWG standards, and can be lax because of the laxity of HTML interpreters (browsers) regarding standards. - + + * htmLawed might not strictly adhere to `current` HTML standards as standard specification for HTML by WHATWG:- http://www.whatwg.org is continuously evolving, and there is laxity among HTML interpreters (browsers) regarding standards. Admins can configure htmLawed to be more strict about standard compliance. + * In general, htmLawed processes input to generate output that is most likely to be standard-compatible in most users' browsers. Thus, for example, it does not enforce the required value of '0' on 'border' attribute of 'img' (an HTML version 5 specification). * htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frame-specific elements 'frameset', 'frame' and 'noframes'. However, content of the latter elements can be individually filtered through htmLawed. @@ -642,10 +652,10 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * It cannot handle input that has non-HTML code like 'SVG' and 'MathML'. One way around is to break the input into pieces and passing only those without non-HTML code to htmLawed. Another is described in section:- #3.9. A third way may be to some how take advantage of the '$config["and_mark"]' parameter (see section:- #3.2). * By default, htmLawed won't check many attribute values for standard compliance. E.g., 'width="20m"' with the dimension in non-standard 'm' is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. 
Admins should look at the 'hook_tag' parameter (section:- #3.4.9) or '$spec' to enforce finer checks on attribute values. + + * By default, htmLawed considers all ARIA, data-*, event, and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the 'itemtype' microdata attribute is permitted only in elements that also have the 'itemscope' attribute. Admins can configure htmLawed to be more strict about this (section:- #2.3). - * By default, htmLawed considers all ARIA, data-*, event and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the 'itemtype' microdata attribute is permitted only in elements that also have the 'itemscope' attribute. Admins can configure htmLawed to be more strict about this (section:- #2.3). - - * The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, '$spec' can be used to allow custom attributes (section:- #2.3). + * The attributes, whether deprecated (which can be transformed by htmLawed) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, '$spec' can be used to allow custom attributes (section:- #2.3). * Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the 'hook_tag' parameter (section:- #3.4.9) or '$spec' for finer checks. Perhaps the best option is to disallow 'style' but allow 'class' attributes with the right 'oneof' or 'match' values for 'class', and have the various class style properties in '.css' CSS stylesheet files. 
@@ -662,17 +672,17 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). * htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per standard specifications. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). - + * There are multiple ways to interpret ill-written HTML. E.g., in 'text', is it that the second closing tag for 'small' is missing or is it that the second opening tag for 'small' was put in by mistake? htmLawed corrects the HTML in the string assuming the former, while the user may have intended the string for the latter. This is an issue that is impossible to address perfectly. * htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers ('/*') in 'style' attribute values in order to detect malicious HTML like crafted, Internet Explorer browser-specific dynamic expressions like 'expression...'. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the 'hook_tag' parameter (section:- #3.4.9) to more specifically identify CSS expressions in the 'style' attribute values. Also, using '$config["style_pass"]', it is possible to have htmLawed pass 'style' attribute values without even looking at them (section:- #3.4.8). * htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., 'x'). These arise when browsers mis-identify markup in `escaped` text, defeating the very purpose of escaping text (a bad browser will read the given example as 'x'). 
- - * Because of poor Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section:- #3.1). - + + * Because of inadequate Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. Such characters, however, are extremely unlikely to be in the input (see section:- #3.1). + * htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can permit an exploit (like Google's `UTF-7/XSS` vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect. - + * htmLawed is expected to work with input texts in ASCII standard-compatible single-byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable-byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double-byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and KS X 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple-byte encoding. If an input text has such an encoding, administrators can use PHP's iconv:- http://php.net/manual/en/book.iconv.php functions, or some other means, to convert text to UTF-8 before passing it to htmLawed.
* Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts. @@ -682,52 +692,52 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Safest, allowing only `safe` HTML markup -- - + $config = array('safe'=>1); $out = htmLawed($in, $config); - + Simplest, allowing all valid HTML markup including Javascript -- - + $out = htmLawed($in); - + Allowing all valid HTML markup but restricting URL schemes in 'src' attribute values to 'http' and 'https' -- - + $config = array('schemes'=>'*:*; src:http, https'); $out = htmLawed($in, $config); - + Allowing only 'safe' HTML and the elements 'a', 'em', and 'strong' -- - + $config = array('safe'=>1, 'elements'=>'a, em, strong'); $out = htmLawed($in, $config); - + Not allowing elements 'script' and 'object' -- - + $config = array('elements'=>'* -script -object'); $out = htmLawed($in, $config); - + Not allowing attributes 'id' and 'style' -- - + $config = array('deny_attribute'=>'id, style'); $out = htmLawed($in, $config); - + Permitting only attributes 'title' and 'href' -- - + $config = array('deny_attribute'=>'* -title -href'); $out = htmLawed($in, $config); - + Remove bad/disallowed tags altogether instead of converting them to entities -- - + $config = array('keep_bad'=>0); $out = htmLawed($in, $config); - + Allowing attribute 'title' only in 'a' and not allowing attributes 'id', 'style', or scriptable `on*` attributes like 'onclick' -- - + $config = array('deny_attribute'=>'title, id, style, on*'); $spec = 'a=title'; $out = htmLawed($in, $config, $spec); - + Allowing a custom attribute, 'vFlag', in 'img' and permitting custom use of the standard attribute, 'rel', in 'input' -- - + $spec = 'img=vFlag; input=rel'; $out = htmLawed($in, $config, $spec); @@ -776,7 +786,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Valid character entities take the form '&*;' 
where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xa0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers. - htmLawed (function 'hl_ent()'): + htmLawed (function 'hl_entity()'): * Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous) @@ -786,7 +796,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, '7f' to '84', '86' to '9f', and 'fdd0' to 'fddf', or decimally, '127' to '132', '134' to '159', and '64976' to '64991'). Entities referring to the remaining discouraged characters (see section:- #5.1 for a full list) are let through. - * Neutralizes named entities that are not in the specifications + * Neutralizes named entities that are not in the HTML5 specification * Optionally converts valid HTML-specific named entities except '&gt;', '&lt;', '&quot;', and '&amp;' to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is '2') for generic XML-compliance. For this, '$config["named_entity"]' should be '1'. @@ -826,15 +836,23 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern See section:- #3.3.3 for differences between the various non-zero '$config["keep_bad"]' values.
- htmLawed by default permits these 118 HTML elements: + htmLawed by default permits these 122 HTML elements: + + a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dialog, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, picture, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, slot, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, template, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr + + htmLawed also supports use of custom HTML elements, but this support can be turned off when $config is appropriately set (i.e., in default configuration, such elements are permitted); see section:- #3.3.6. + + Elements 'math' and 'svg' are not supported. They and their content will get `filtered` unless a strategy like in section:- #3.9 is used. 
- a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr + Elements like 'acronym', 'applet', 'basefont', 'bgsound', 'big', 'blink', 'center', 'command', 'dir', 'font', 'hgroup', 'image', 'keygen', 'marquee', 'menuitem', 'nobr', 'noembed', 'rb', 'rtc', 'shadow', 'spacer', 'strike', 'tt', and 'xmp' are currently obsolete/deprecated. Some of them, like 'acronym' and 'keygen', are supported in htmLawed (see above list). `Tag transformation` is possible for improving compliance with HTML standards -- most, but not all, of the obsolete/deprecated elements are converted to valid ones; see section:- #3.3.2. - The HTML version 4 elements 'acronym', 'applet', 'big', 'center', 'dir', 'font', 'strike', and 'tt' are obsolete/deprecated in HTML version 5. On the other hand, the obsolete/deprecated HTML 4 elements 'embed', 'menu' and 'u' are no longer so in HTML 5. Elements new to HTML 5 are 'article', 'aside', 'audio', 'bdi', 'canvas', 'command', 'data', 'datalist', 'details', 'figure', 'figcaption', 'footer', 'header', 'hgroup', 'keygen', 'link', 'main', 'mark', 'meta', 'meter', 'nav', 'output', 'progress', 'section', 'source', 'style', 'summary', 'time', 'track', 'video', and 'wbr'. The 'link', 'meta' and 'style' elements exist in HTML 4 but are not allowed in the HTML body. 
These 16 elements are `empty` elements that have an opening tag with possible content but no element content (thus, no closing tag): 'area', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', and 'wbr'. + These 16 htmLawed-supported elements are `empty` elements that have an opening tag with possible content but no element content (thus, no closing tag): 'area', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', and 'wbr'. - With '$config["safe"] = 1', the default set will exclude 'applet', 'audio', 'canvas', 'embed', 'iframe', 'object', 'script' and 'video'; see section:- #3.6. + As per standards, closing tags are optional for these elements under certain conditions: 'caption', 'colgroup', 'dd', 'dt', 'li', 'optgroup', 'option', 'p', 'rp', 'rt', 'tbody', 'td', 'tfoot', 'th', 'thead', and 'tr'. By default, htmLawed will add a missing closing tag for such elements, unless balancing (section:- #3.3.3) is turned off. - When '$config["elements"]', which specifies allowed elements, is `properly` defined, and neither empty nor set to '0' or '*', the default set is not used. To have elements added to or removed from the default set, a '+/-' notation is used. E.g., '*-script-object' implies that only 'script' and 'object' are disallowed, whereas '*+embed' means that 'noembed' is also allowed. Elements can also be specified as comma separated names. E.g., 'a, b, i' means only 'a', 'b' and 'i' are permitted. In this notation, '*', '+' and '-' have no significance and can actually cause a mis-reading. + With '$config["safe"] = 1', the default set of htmLawed-supported elements will exclude 'applet', 'audio', 'canvas', 'dialog', 'embed', 'iframe', 'object', 'script' and 'video'; see section:- #3.6. 
+ + When '$config["elements"]', which specifies allowed elements, is `properly` defined, and neither empty nor set to '0' or '*', the default set is not used. To have elements added to or removed from the default set, a '+/-' notation is used. E.g., '*-script-object' implies that only 'script' and 'object' are disallowed, whereas '*+noembed' means that 'noembed' is also allowed. For an element with a hyphen in name, use round brackets around the name; e.g., '(my-custom-element)'. Elements can also be specified as comma separated names. E.g., 'a, b, i' means only 'a', 'b' and 'i' are permitted. In this notation, '*', '+' and '-' have no significance and can actually cause a mis-reading. Some more examples of '$config["elements"]' values indicating permitted elements (note that empty spaces are liberally allowed for clarity): @@ -842,6 +860,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * '*-script' -- all excluding 'script' * '* -acronym -big -center -dir -font -isindex -s -strike -tt' -- only non-obsolete/deprecated elements of HTML5 * '*+noembed-script' -- all including 'noembed' excluding 'script' + * '*+noembed+(my-custom-element)' -- all including 'noembed' and 'my-custom-element' Some mis-usages (and the resulting permitted elements) that can be avoided: @@ -855,11 +874,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Basically, when using the '+/-' notation, commas (',') should not be used, and vice versa, and '*' should be used with the former but not the latter. - *Note*: Even if an element that is not in the default set is allowed through '$config["elements"]', like 'noembed' in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ('$config["balance"]' set to '0'). 
Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which define various arrays in the function 'hl_bal()' to accommodate the element and its nesting properties. - - A possible second way to specify allowed elements is to set '$config["parent"]' to an element name that supposedly will hold the input, and to set '$config["balance"]' to '1'. During tag balancing (see section:- #3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to 'div' if '$config["parent"]' is empty, 'body', or an element not in htmLawed's default set of 118 elements. + *Note*: Even if an element that is not in the default set is allowed through '$config["elements"]', like 'noembed' in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ('$config["balance"]' set to '0'). Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which defines various arrays in the function 'hl_balance()' to accommodate the element and its nesting properties. - `Tag transformation` is possible for improving compliance with HTML standards -- most of the obsolete/deprecated elements of HTML version 5 are converted to valid ones; see section:- #3.3.2. + A possible second way to specify allowed elements is to set '$config["parent"]' to an element name that supposedly will hold the input, and to set '$config["balance"]' to '1'. During tag balancing (see section:- #3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to 'div' if '$config["parent"]' is empty, 'body', or an element not in htmLawed's default set of 122 elements. .. 3.3.1 Handling of comments & CDATA sections .....................
@@ -867,7 +884,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern 'CDATA' sections have the format '<![CDATA[...anything but not "]]>"...]]>', and HTML comments, '<!--...anything but not "-->"... -->'. Neither HTML comments nor 'CDATA' sections can reside inside tags. HTML comments can exist anywhere else, but 'CDATA' sections can exist only where plain text is allowed (e.g., immediately inside 'td' element content but not immediately inside 'tr' element content). - htmLawed (function 'hl_cmtcd()') handles HTML comments or 'CDATA' sections depending on the values of '$config["comment"]' or '$config["cdata"]'. If '0', such markup is not looked for and the text is processed like plain text. If '1', it is removed completely. If '2', it is preserved but any '<', '>' and '&' inside are changed to entities. If '3' for '$config["cdata"]', or '3' or '4' for '$config["comment"]', they are left as such. When '$config["comment"]' is set to '4', htmLawed will not force a space character before the '-->' comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook). + htmLawed (function 'hl_commentCdata()') handles HTML comments or 'CDATA' sections depending on the values of '$config["comment"]' or '$config["cdata"]'. If '0', such markup is not looked for and the text is processed like plain text. If '1', it is removed completely. If '2', it is preserved but any '<', '>' and '&' inside are changed to entities. If '3' for '$config["cdata"]', or '3' or '4' for '$config["comment"]', they are left as such. When '$config["comment"]' is set to '4', htmLawed will not force a space character before the '-->' comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook). Note that for the last two cases, HTML comments and 'CDATA' sections will always be removed from tag content (function 'hl_tag()').
@@ -887,7 +904,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Home Output ('$config["comment"] = 4, $config["cdata"] = 3'): Home - + For standard-compliance, comments are given the form '', and any '--' in the content is made '-'. When '$config["comment"]' is set to '4', htmLawed will not force a space character before the '-->' comment-closing marker. When '$config["safe"] = 1', CDATA sections and comments are considered plain text unless '$config["comment"]' or '$config["cdata"]' is explicitly specified; see section:- #3.6. @@ -896,14 +913,14 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern .. 3.3.2 Tag-transformation for better compliance with standards ..o - If '$config["make_tag_strict"]' is set and not '0', following deprecated elements (and attributes), as per HTML 5 specification, even if admin-permitted, are mutated as indicated (element content remains intact; function 'hl_tag2()'): + If '$config["make_tag_strict"]' is set and not '0', following deprecated elements (and attributes), even if admin-permitted, are mutated as indicated (element content remains intact; function 'hl_deprecatedElement()'): * acronym - 'abbr' * applet - based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2') * big - 'span style="font-size: larger;"' * center - 'div style="text-align: center;"' * dir - 'ul' - * font (face, size, color) - 'span style="font-family: ; font-size: ; color: ;"' (size transformation reference:- http://style.cleverchimp.com/font_size_intervals/altintervals.html) + * font (face, size, color) - 'span style="font-family: ; font-size: ; color: ;"' (size transformation reference:- http://web.archive.org/web/20180201141931/http://style.cleverchimp.com/font_size_intervals/altintervals.html) * isindex - based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2') * s - 'span style="text-decoration: line-through;"' * strike - 'span style="text-decoration: 
line-through;"' @@ -920,14 +937,14 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern The output:
- The PHP software script used for this web-page web-page is htmLawedTest.php, from PHP Labware. + The PHP software script used for this web-page web-page is htmLawedTest.php, from PHP Labware.
.. 3.3.3 Tag balancing & proper nesting ...........................o - If '$config["balance"]' is set to '1', htmLawed (function 'hl_bal()') checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them). + If '$config["balance"]' is set to '1', htmLawed (function 'hl_balance()') checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them). Depending on the value of '$config["keep_bad"]' (see section:- #2.2 and section:- #3.3), illegal content may be removed or neutralized to plain text by converting < and > to entities: @@ -968,16 +985,16 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern <*> Pseudo-tags <*> Non-HTML tag xml - + Disallowed tag p - + An option like '1' is useful, e.g., when a writer previews his submission, whereas one like '3' is useful before content is finalized and made available to all. *Note:* In the example above, unlike '<*>', '' gets considered as a tag (even though there is no HTML element named 'xml'). Thus, the 'keep_bad' parameter's value affects '' but not '<*>'. In general, text matching the regular expression pattern '<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>' is considered a tag (phrase enclosed by the angled brackets '<' and '>', and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...), and is subjected to the 'keep_bad' value. - Nesting/content rules for each of the 118 elements in htmLawed's default set (see section:- #3.3) are defined in function 'hl_bal()'. This means that if a non-standard element besides 'embed' is being permitted through '$config["elements"]', the element's tag content will end up getting removed if '$config["balance"]' is set to '1'. 
+ Nesting/content rules for each of the 122 standard elements in htmLawed's default set (see section:- #3.3) are defined in function 'hl_balance()'. Any custom element (section:- #3.3.6) is permitted to be within and to contain any other element. Plain text and/or certain elements nested inside 'blockquote', 'form', 'map' and 'noscript' need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as 'form', the input 'B:C:' is converted to '
B:C:
'. @@ -992,7 +1009,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern In some cases, the specifications stipulate the number and/or the ordering of the child elements. A 'table' can have 0 or 1 'caption', 'tbody', 'tfoot', and 'thead', but they must be in this order: 'caption', 'thead', 'tfoot', 'tbody'. htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages. - + With '$config["direct_list_nest"]' set to '1', htmLawed will allow direct nesting of 'ol', 'ul', or 'menu' list within another 'ol', 'ul', or 'menu' without requiring the child list to be within an 'li' of the parent list. While this may not be standard-compliant, directly nested lists are rendered properly by almost all browsers. The parameter '$config["direct_list_nest"]' has no effect if tag balancing (section:- #3.3.3) is turned off. @@ -1003,33 +1020,51 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside 'pre' elements) are all considered equivalent, and referred to as `white-spaces`. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space `normalization` allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such `pretty` HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome. - With the '$config' parameter 'tidy', htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. 
Besides 'pre', the 'script' and 'textarea' elements, CDATA sections, and HTML comments are not subjected to the tidying process. + With the '$config' parameter 'tidy', htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides 'pre', the 'script', and 'textarea' elements, CDATA sections, and HTML comments are not subjected to the tidying process. + + Any custom HTML element (section:- #3.3.6) is treated like an inline element, like 'strong', during tidying. To `compact`, use '$config["tidy"] = -1'; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed. To `beautify`, '$config["tidy"]' is set as '1', or for customized tidying, as a string like '2s2n'. The 's' or 't' character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The 'r' and 'n' characters are used to specify line-break characters: 'n' for '\n' (Unix/Mac OS X line-breaks), 'rn' or 'nr' for '\r\n' (Windows/DOS line-breaks), or 'r' for '\r'. + For instance, with '$config["tidy"]' set as '3s2n', 3 space characters are used per indentation level, the entire block of text (HTML code) gets a lead (left spacing) of 2 space characters, and line-breaks are with '\n' character. + The '$config["tidy"]' value of '1' is equivalent to '2s0n'. Other '$config["tidy"]' values are read loosely: a value of '4' is equivalent to '4s0n'; 't2', to '1t2n'; 's', to '2s0n'; '2TR', to '2t0r'; 'T1', to '1t1n'; 'nr3', to '3s0nr', and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification. 
Input formatting using '$config["tidy"]' is not recommended when input text has mixed markup (like HTML + PHP). --- 3.4 Attributes -------------------------------------------------o +.. 3.3.6 Custom HTML elements .....................................o - In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in section:- #5.2. Using the '$spec' argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (section:- #2.3). + Custom elements are HTML elements whose properties/behaviors are defined by the `author`, instead of being `universal` (i.e., defined by the HTML interpreter like a browser). Their names must begin with a lowercased a-z character, contain at least one hyphen (-), and cannot be: `annotation-xml, color-profile, font-face, font-face-src, font-face-uri, font-face-format, font-face-name, missing-glyph`. A huge variety of characters is permitted in the name. - Custom `data-*` (`data-star`) attributes, where the first three characters of the value of `star` (*) after lower-casing do not equal 'xml', and the value of `star` does not have a colon (:), equal-to (=), newline, solidus (/), space or tab character, or any upper-case A-Z character are allowed in all elements. ARIA, event and microdata attributes like 'aria-live', 'onclick' and 'itemid' are also considered global attributes (section:- #5.2). + 0-9 | . | _ | #xB7 | #xC0-#xD6 | #xD8-#xF6 | #xF8-#x37D | #x37F-#x1FFF | #x200C-#x200D | #x203F-#x2040 | #x2070-#x218F | #x2C00-#x2FEF | #x3001-#xD7FF | #xF900-#xFDCF | #xFDF0-#xFFFD | [#x10000-#xEFFFF] + + With '$config["any_custom_element"]' set to '0', no custom element is permitted, whereas with a value of '1' (default value), any such element is permitted. 
Regardless of the setting, specific custom elements can be denied or permitted through '$config["elements"]' (see section:- #3.3.1). - When '$config["deny_attribute"]' is not set, or set to '0', or empty ('""'), all attributes are permitted. Otherwise, '$config["deny_attribute"]' can be set as a list of comma-separated names of the denied attributes. 'on*' can be used to refer to the group of potentially dangerous, script-accepting event attributes like 'onblur' and 'onchange' that have 'on' at the beginning of their names. Similarly, 'aria*' and 'data*' can be used to respectively refer to the set of all ARIA and data-* attributes. + Any custom HTML element is treated like an inline element, like 'strong', during tidying (section:- #3.3.5). During tag balancing (section:- #3.3.3), any custom element is permitted to be within and to contain any other element. These laxities are necessitated because, by definition, custom elements are parochial. + + Custom elements are permitted to have attributes of any name consisting of any character except a few such as equal, forward slash, and most control characters (unless denied through '$spec') and satisfying any 'data' attribute name requirement. + + +-- 3.4 Attributes -------------------------------------------------o + + + In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in section:- #5.2. Using the '$spec' argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (section:- #2.3). + + Custom `data-*` (`data-star`) attributes, where the first three characters of the value of `star` (*) after lower-casing do not equal 'xml', and the value of `star` does not have a colon (:), equal-to (=), newline, solidus (/), space or tab character, or any upper-case A-Z character are allowed in all elements. 
ARIA, event and microdata attributes like 'aria-live', 'onclick' and 'itemid' are also considered global attributes (section:- #5.2). - With '$config["safe"] = 1' (section:- #3.6), the 'on*' event attributes are automatically disallowed even if a value for '$config["deny_attribute"]' has been manually provided. + When '$config["deny_attribute"]' is not set, or set to '0', or empty ('""'), all attributes are permitted as per standards. Otherwise, '$config["deny_attribute"]' can be set in two different ways. One way is as a list of comma-separated names of the denied attributes. 'on*' can be used to refer to the group of potentially dangerous, script-accepting event attributes like 'onchange' that have 'on' at the beginning of their names. Similarly, 'aria*' and 'data*' can be used to respectively refer to the set of all ARIA and data-* attributes. The second way to set '$config["deny_attribute"]' permits the denying of all but a few attributes globally. The notation is '* -attribute1 -attribute2 ...'. Thus, a value of '* -title -href' implies that except 'href' and 'title' (where allowed as per standards) all other attributes are to be removed. Terms 'aria*', 'data*', and 'on*' can be used in this notation, and a whitespace character is necessary before the '-' character. + + With '$config["safe"] = 1' (section:- #3.6), any 'on*' event attribute is disallowed even if '$config["deny_attribute"]' is set otherwise (such as '* -style -on*'). - Note that attributes specified in '$config["deny_attribute"]' are denied globally, for all elements. To deny attributes for only specific elements, '$spec' (see section:- #2.3) can be used. + The attribute restrictions specified with '$config["deny_attribute"]' apply to all elements. To deny attributes for only specific elements, '$spec' (see section:- #2.3) can be used.
'$spec' can also be used to element-specifically permit an attribute otherwise denied through '$config["deny_attribute"]'. - Finer restrictions on attributes can also be put into effect through '$config["deny_attribute"]' (section:- 3.4.9). + Finer restrictions on attributes can also be put into effect through '$config["hook_tag"]' (section:- #3.4.9). - *Note*: To deny all but a few attributes globally, a simpler way to specify '$config["deny_attribute"]' would be to use the notation '* -attribute1 -attribute2 ...'. Thus, a value of '* -title -href' implies that except 'href' and 'title' (where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter 'safe' (section:- #3.6) will have no effect on 'deny_attribute'. Values of 'aria*' 'data*', and 'on*' cannot be used in this notation to refer to the sets of all ARIA, data-*, and on* attributes respectively. + Custom elements are permitted to have attributes of any name consisting of any character except a few such as equal, forward slash, and most control characters (unless denied through '$spec') and satisfying any 'data' attribute name requirement. 
htmLawed (function 'hl_tag()') also: @@ -1037,6 +1072,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Removes duplicate attributes (last one stays) * Gives attributes the form 'name="value"' and single-spaces them, removing unnecessary white-spacing * Provides `required` attributes (see section:- #3.4.1) + * Optionally lowercases certain standard attribute values (see section:- #3.4.5) * Double-quotes values and escapes any '"' inside them * Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point 'ad') in the values with spaces * Allows custom function to additionally filter/modify attribute values (see section:- #3.4.9) @@ -1085,22 +1121,22 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Also, only 'data', 'file', 'http', 'https' and 'javascript' are permitted in these attributes that accept URLs: - action, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, src, srcset, style, usemap, and event attributes like onclick + action, archive, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, poster, src, srcset, style, usemap, and event attributes like onclick With '$config["safe"] = 1' (section:- #3.6), the above is changed to disallow 'app', 'data' and 'javascript'. + *Note*: URLs in `data-*` attribute values are not checked, but $spec (section:- #2.3) or '$config["hook_tag"]' (section:- #3.4.9) can be used for this purpose. + These default sets are used when '$config["schemes"]' is not set (see section:- #2.2). To over-ride the defaults, '$config["schemes"]' is defined as a string of semi-colon-separated sub-strings of type 'attribute: comma-separated schemes'. E.g., 'href: mailto, http, https; onclick: javascript; src: http, https'. For unspecified attributes, 'data', 'file', 'http', 'https' and 'javascript' are permitted. This can be changed by passing schemes for '*' in '$config["schemes"]'. 
E.g., 'href: mailto, http, https; *: http, https'. - '*' (asterisk) can be put in the list of schemes to permit all protocols. E.g., 'style: *; img: http, https' results in protocols not being checked in 'style' attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section:- #3.4.4) is not done. When an attribute is explicitly listed in '$config["schemes"]', then filtering is dictated by the setting for the attribute, with no effect of the setting for asterisk. That is, the set of attributes that asterisk refers to no longer includes the listed attribute. + '*' (asterisk) can be put in the list of schemes to permit all protocols. E.g., 'style: *; img: http, https' results in protocols not being checked in 'style' attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section:- #3.4.4) is not done. When an attribute is explicitly listed in '$config["schemes"]', then filtering is dictated by the setting for the attribute, with no effect of the setting for asterisk. That is, the set of attributes that asterisk refers to no longer includes the listed attribute. Thus, `to allow the xmpp scheme`, one can set '$config["schemes"]' as 'href: mailto, http, https; *: http, https, xmpp', or 'href: mailto, http, https, xmpp; *: http, https, xmpp', or '*: *', and so on. The consequence of each of these example values will be different (e.g., only the last two but not the first will allow 'xmpp' in 'href'). As a side-note, one may find 'style: *' useful as URLs in 'style' attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text. - + '!' can be put in the list of schemes to disallow all protocols as well as `local` URLs.
Thus, with 'href: http, style: !', 'CNN' will become 'CNN' - *Note*: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string 'src' (e.g., 'dynsrc') or starts with 'o' (e.g., 'onbeforecopy'). - With '$config["safe"] = 1', all URLs are disallowed in the 'style' attribute values. @@ -1228,23 +1264,21 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern .. 3.4.9 Hook function for tag content ............................o - It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.). + It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.). The function should have two arguments, the first receiving an element name and the second receiving either '0' (in case of a closing tag) or an array of attribute name-value pairs (opening tag). It should return a string with full HTM markup, either an opening or a closing tag with element name and any string of attributes. - When '$config' parameter 'hook_tag' is set to the name of a function, htmLawed (function 'hl_tag()') will pass on the element name, and the `finalized` attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like '' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included), etc. - - Any 'hook_tag' function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as 'a' in the closing '' tag of the element 'CNN'. 
No other value is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like ''). + When '$config' parameter 'hook_tag' is set to the name of a function or class method, htmLawed (function 'hl_tag()') will pass on the element name, and the `finalized` attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like '' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included), etc. This is a *powerful functionality* that can be exploited for various objectives: consolidate-and-convert inline 'style' attributes to 'class', convert 'embed' elements to 'object', permit only one 'caption' element in a 'table' element, disallow embedding of certain types of media, *inject HTML*, use CSSTidy:- http://csstidy.sourceforge.net to sanitize 'style' attribute values, etc. As an example, the custom hook code below can be used to force a series of specifically ordered 'id' attributes on all elements, and a specific 'param' element inside all 'object' elements: function my_tag_function($element, $attribute_array=0){ - + // If second argument is not received, it means a closing tag is being handled if(is_numeric($attribute_array)){ return ""; } - + static $id = 0; // Remove any duplicate element if($element == 'param' && isset($attribute_array['allowscriptaccess'])){ @@ -1267,7 +1301,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern foreach($attribute_array as $k=>$v){ $string .= " {$k}=\"{$v}\""; } - + static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); return "<{$element}{$string}". 
(array_key_exists($element, $empty_elements) ? ' /' : ''). '>'. $new_element; @@ -1275,7 +1309,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern The 'hook_tag' parameter is different from the 'hook' parameter (section:- #3.7). - Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. + Snippets of hook function code developed by others may be available on the htmLawed:- https://bioinformatics.org/phplabware/internal_utilities/htmLawed website. -- 3.5 Simple configuration directive for most valid XHTML --------o @@ -1290,42 +1324,42 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern `Safe` HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specifications. When elements such as 'script' and 'object', and attributes such as 'onmouseover' and 'style' are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered 'safe' depends on the nature of the web application and the trust-level accorded to its users. htmLawed allows an admin to use '$config["safe"]' to auto-adjust multiple '$config' parameters (such as 'elements' which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by '"' in section:- #2.2). Thus, one can pass the '$config' argument with a simpler value. 
Having the 'safe' parameter set to '1' is equivalent to setting the following '$config' parameters to the noted values : - + cdata - 0 comment - 0 deny_attribute - on* - elements - * -applet -audio -canvas -embed -iframe -object -script -video + elements - * -applet -audio -canvas -dialog -embed -iframe -object -script -video schemes - href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style: !; *:file, http, https - With 'safe' set to '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'audio', 'canvas', 'embed', 'iframe', 'object', 'script' and 'video' elements, and the 'on*' attributes like 'onclick'. ( There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, unless overridden by the value for parameter 'schemes' (see section:- #3.4.3), the schemes 'app', 'data' and 'javascript' are not permitted, and URLs with schemes are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"'. + With 'safe' set to '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'audio', 'canvas', 'dialog', 'embed', 'iframe', 'object', 'script' and 'video' elements, and the 'on*' attributes like 'onclick'. ( There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, unless overridden by the value for parameter 'schemes' (see section:- #3.4.3), the schemes 'app', 'data' and 'javascript' are not permitted, and URLs with schemes are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"'. 
Admins, however, may still want to completely deny the 'style' attribute, e.g., with code like $processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style')); - Permitting the 'style' attribute brings in risks of `click-jacking`, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check 'style' values. It does provide ways for the code-developer implementing htmLawed to do such checks through the '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended. - + Permitting the 'style' attribute brings in risks of `click-jacking`, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check 'style' values. It does provide ways for the code-developer implementing htmLawed to do such checks through the '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended. + If a value for a parameter auto-set through 'safe' is still manually provided, then that value can over-ride the auto-set value. E.g., with '$config["safe"] = 1' and '$config["elements"] = "* +script"', 'script', but not 'applet', is allowed. Such over-ride does not occur for 'deny_attribute' (for legacy reason) when comma-separated attribute names are provided as the value for this parameter (section:- #3.4); instead htmLawed will add 'on*' to the value provided for 'deny_attribute'. 
- A page illustrating the efficacy of htmLawed's anti-XSS abilities with 'safe' set to '1' against XSS vectors listed by RSnake:- http://ha.ckers.org/xss.html may be available here:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm. + A page illustrating the efficacy of htmLawed's anti-XSS abilities with 'safe' set to '1' against XSS vectors listed by RSnake:- http://ha.ckers.org/xss.html may be available here:- https://bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm. -- 3.7 Using a hook function --------------------------------------o - If '$config["hook"]' is not set to '0', then htmLawed will allow preliminarily processed input to be altered by a hook function named by '$config["hook"]' before starting the main work (but after handling of characters, entities, HTML comments and 'CDATA' sections -- see code for function 'htmLawed()'). + If '$config["hook"]' is not set to '0', then htmLawed will allow preliminarily processed input to be altered by a function or class method named by '$config["hook"]' before starting the main work (but after handling of characters, entities, HTML comments and 'CDATA' sections -- see code for function 'htmLawed()'). The function should have three arguments – the processed input string, and the finalized '$config' and '$spec' arrays, in order – and it should return the string after any manipulation. The hook function also allows one to alter the `finalized` values of '$config' and '$spec'. Note that the 'hook' parameter is different from the 'hook_tag' parameter (section:- #3.4.9). - Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. + Snippets of hook function code developed by others may be available on the htmLawed:- https://bioinformatics.org/phplabware/internal_utilities/htmLawed website. 
-- 3.8 Obtaining `finalized` parameter values ---------------------o - htmLawed can assign the `finalized` '$config' and '$spec' values to a variable named by '$config["show_setting"]'. The variable, made global by htmLawed, is set as an array with three keys: 'config', with the '$config' value, 'spec', with the '$spec' value, and 'time', with a value that is the Unix time (the output of PHP's 'microtime()' function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code. + htmLawed can assign the `finalized` '$config' and '$spec' values to a variable named by '$config["show_setting"]'. The variable, made global by htmLawed, is set as an array with four keys: 'config', with the '$config' value, 'spec', with the '$spec' value, 'time', with a value that is the Unix time (the output of PHP's 'microtime' function) when htmLawed completed filtering, and 'version', with htmLawed version. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code. The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers. @@ -1346,7 +1380,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern This code will not work if '$config["clean_ms_char"]' is set to '1' (section:- #3.1), in which case one should instead deploy a hook function (section:- #3.7). (htmLawed internally uses certain control characters, code-points '1' to '7', and use of these characters as markers in the logic of hook functions may cause issues.) Admins may also be able to use '$config["and_mark"]' to deal with such mixed markup; see section:- #3.2. 
- + == 4 Other =======================================================oo @@ -1354,7 +1388,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 4.1 Support ----------------------------------------------------- - Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net. + Software updates and forum-based community-support may be found at https://bioinformatics.org/phplabware/internal_utilities/htmLawed. -- 4.2 Known issues -----------------------------------------------o @@ -1370,58 +1404,72 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern `Version number - Release date. Notes` - 1.2.4.2 - 16 May 2019. Corrects a PHP notice if a semi-colon is present in '$config["schemes"]' + 1.2.11 - 23 January 2023. Fixes an XSS vulnerability arising from a lack of inspection for the alphabetical HTML entity for colon character in URLs - 1.2.4.1 - 12 September 2017. Corrects a function re-declaration bug introduced in version 1.2.4 + 1.2.10 - 5 November 2022. Class methods can now be specified as '$config' 'hook' and 'hook_tag' functions; corrects a PHP notice if '$config["schemes"]' mistakenly lacks colons. - 1.2.4 - 31 August 2017. Removes use of PHP 'create_function' function and '$php_errormsg' reserved variable (deprecated in PHP 7.2) + 1.2.9 - 2 July 2022. 
Improves parsing of '$config["deny_attribute"]' to permit spaces flanking comma characters and allow references to sets of all ARIA, data-* and event attributes; fixes parsing of '$spec' for data-* attribute rules; now permits use of 'aria*', 'data*', and 'on*' in '$spec'; now covers all named HTML entities of current standard specification (this increased htmLawed code size by ~40%); recognizes that closing tag may be omitted for 'caption', 'optgroup', 'rp', 'rt', and 'tbody' as well; recognizes that 'archive' and 'poster' attribute values can have URLs, which can be multiple; recognizes 'onloadend' as global attribute; renames some internal functions; improved standards-compliance for element nesting. - 1.2.3 - 5 July 2017. New option value of '4' for '$config["comments"]' to stop enforcing a space character before the '-->' comment-closing marker + 1.2.8 - 6 June 2022. Fixes incorrect formatting of HTML comments when '$config["comment"]' = 4; fixes misreading of entity-fied colon characters in 'style' attribute values; '$config["show_setting"]' now includes htmLawed version; improved PHP 8.2 code compatibility, and readability - 1.2.2 - 25 May 2017. Fix for a bug in parsing '$spec' that got introduced in version 1.2; also, '$spec' is now parsed to accommodate specifications for an HTML element when they are specified in multiple rules + 1.2.7 - 10 April 2022. Support for elements 'dialog', 'picture', 'slot', and 'template'; support for custom HTML elements; support for global attributes 'autocapitalize', 'autofocus', 'enterkeyhint', 'inputmode', 'is', and 'nonce'; support for 17 additional ARIA and 11 additional on* event handler attributes; support for attributes with names not beginning with a-z; fix for a minor bug arising during deprecated height/weight attribute transformation - 1.2.1.1 - 17 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes + 1.2.6 - 4 September 2021. 
Fixes a bug that arises when '$config["deny_attribute"]' has a 'data-*' attribute with > 1 hyphen character - 1.2.1 - 15 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes + 1.2.5 - 24 September 2019. Fixes two bugs in 'font' tag transformation + + 1.2.4.2 - 16 May 2019. Corrects a PHP notice if a semi-colon is present in '$config["schemes"]' + 1.2.4.1 - 12 September 2017. Corrects a function re-declaration bug introduced in version 1.2.4 + + 1.2.4 - 31 August 2017. Removes use of PHP 'create_function' function and '$php_errormsg' reserved variable (deprecated in PHP 7.2) + + 1.2.3 - 5 July 2017. New option value of '4' for '$config["comments"]' to stop enforcing a space character before the '-->' comment-closing marker + + 1.2.2 - 25 May 2017. Fix for a bug in parsing '$spec' that got introduced in version 1.2; also, '$spec' is now parsed to accommodate specifications for an HTML element when they are specified in multiple rules + + 1.2.1.1 - 17 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes + + 1.2.1 - 15 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes + 1.2 - 11 February 2017. (First beta release on 26 May 2013). Added support for HTML version 5; ARIA, data-* and microdata attributes; 'app', 'data', 'javascript' and 'tel' URL schemes (thus, 'javascript:' is not filtered in default mode). Removed support for code using Kses functions (see section:- #2.6). Changes in revisions to the beta releases are not noted here. 1.1.22 - 5 March 2016. Improved testing of attribute value rules specified in '$spec' - + 1.1.21 - 27 February 2016. Improvement and security fix in transforming 'font' element 1.1.20 - 9 June 2015. 
Fix for a potential security vulnerability arising from unescaped double-quote character in single-quoted attribute value of some deprecated elements when tag transformation is enabled; recognition for non-(HTML 4) standard 'allowfullscreen' attribute of 'iframe' - + 1.1.19 - 19 January 2015. Fix for a bug in cleaning of soft-hyphens in URL values, etc 1.1.18 - 2 August 2014. Fix for a potential security vulnerability arising from specially encoded text with serial opening tags - + 1.1.17 - 11 March 2014. Removed use of PHP function preg_replace with 'e' modifier for compatibility with PHP 5.5. 1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specially encoded space characters in URL schemes/protocols - + 1.1.15 - 11 August 2013. Improved tidying/prettifying functionality - + 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during 'tidying' when 'balance' is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like 'blockquote' - + 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes - - 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the 'face' attribute - + + 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the 'face' attribute + 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload environment. '$config["hook_tag"]', if specified, now receives names of elements in closing tags. - + 1.1.10 - 22 October 2011. Fix for a bug in the 'tidy' functionality that caused the entire input to be replaced with a single space; new parameter, '$config["direct_list_nest"]' to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.) - + 1.1.9.5 - 6 July 2011. 
Minor correction of a rule for nesting of 'li' within 'dir' - + 1.1.9.4 - 3 July 2010. Parameter 'schemes' now accepts '!' so any URL, even a local one, can be `denied`. An issue in which a second URL value in 'style' properties was not checked was fixed. - + 1.1.9.3 - 17 May 2010. Checks for correct nesting of 'param' - + 1.1.9.2 - 26 April 2010. Minor fix regarding rendering of denied URL schemes - + 1.1.9.1 - 26 February 2010. htmLawed now uses the LGPL version 3 license; support for 'flashvars' attribute for 'embed' - + 1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice @@ -1433,7 +1481,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern 1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamically crafted CSS expressions 1.1.2 - 22 January 2009. Fixed bug in parsing of 'font' attributes during tag transformation - + 1.1.1 - 27 September 2008. Better nesting correction when omittable closing tags are absent 1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["tidy"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug @@ -1454,7 +1502,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern 1.0.2 - 13 February 2008. Improved implementation of '$config["keep_bad"]' - 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions ('hl_tag()' and 'hl_prot()'); no error display with 'hl_regex()' + 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions; no error display during regex testing 1.0 - 2 November 2007. 
First release @@ -1473,18 +1521,18 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *Note:* The following upgrades may affect the functionality of a specific htmLawed installation: (1) From version 1.1-1.1.10 to 1.1.11 or later, if a 'hook_tag' function is in use: In version 1.1.11 and later, elements in closing tags (and not just the opening tags) are also passed to the function. There are no attribute names/values to pass, so a 'hook_tag' function receives only the element name. The 'hook_tag' function therefore may have to be edited. See section:- #3.4.9. - + (2) From version older than 1.2.beta to later, if htmLawed was used as Kses replacement with Kses code in use: In version 1.2.beta or later, htmLawed no longer provides direct support for code that uses Kses functions (see section:- #2.6). - + (3) From version older than 1.2 to later, if htmLawed is used without '$config["safe"]' set to 1: Unlike previous versions, htmLawed version 1.2 and later permit 'data' and 'javascript' URL schemes by default (see section:- #3.4.3). - Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip; for 1.1.1, http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip; and for 1.1.22, http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip. + Old versions of htmLawed may be available online. E.g., for version 1.0, check https://bioinformatics.org/phplabware/downloads/htmLawed1.zip; for 1.1.1, https://bioinformatics.org/phplabware/downloads/htmLawed111.zip; and for 1.1.22, https://bioinformatics.org/phplabware/downloads/htmLawed1122.zip. -- 4.6 Comparison with 'HTMLPurifier' -----------------------------o - The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. 
Compared to htmLawed, it (as of year 2015): + The HTMLPurifier:- http://htmlpurifier.org PHP library by Edward Yang is a good HTML filtering script that uses object-oriented PHP code. Compared to htmLawed, as of year 2015, HTMLPurifier: * does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2) @@ -1492,25 +1540,27 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory) - * is expectedly slower + * is expectedly considerably slower * lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification) * has poor documentation - However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier website:- http://htmlpurifier.org for updated information. + * may have finer checks for character encodings and attribute values + + * can log warnings and errors -- 4.7 Use through application plug-ins/modules -------------------o - Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed forum:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. + Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed forum:- https://bioinformatics.org/phplabware/internal_utilities/htmLawed. -- 4.8 Use in non-PHP applications --------------------------------o - Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. 
+ Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site:- https://bioinformatics.org/phplabware/internal_utilities/htmLawed. -- 4.9 Donate -----------------------------------------------------o @@ -1522,7 +1572,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 4.10 Acknowledgements ------------------------------------------o - Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Alexandre Chouinard, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, Edward Yang, and many anonymous users. + Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Michael Butler, Dac Chartrand, Alexandre Chouinard, NinCollin, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Hideki Mitsuda, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, walrusmoose, Edward Yang, and many others. Thank you! 
@@ -1541,11 +1591,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 5.2 Valid attribute-element combinations -----------------------o - * includes deprecated attributes (marked '^'), attributes for microdata (marked '*'), the non-standard 'bordercolor', and new-in-HTML5 attributes (marked '~'); can have multiple comma-separated values (marked '%'); can have multiple space-separated values (marked '$') + * includes deprecated attributes (marked '^'), attributes for microdata (marked '*'), some non-standard attributes for 'embed' (marked '**'), and the non-standard 'bordercolor'; can have multiple comma-separated values (marked '%'); can have multiple space-separated values (marked '$') * only non-frameset, HTML body elements * 'name' for 'a' and 'map', and 'lang' are invalid in XHTML 1.1 - * 'target' is valid for 'a' in XHTML 1.1 and higher * 'xml:space' is only for XHTML 1.1 + * excludes data-* and author-specified, non-standard attributes of custom elements abbr - td, th accept - form, input @@ -1555,17 +1605,17 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern allowfullscreen - iframe alt - applet, area, img, input archive - applet, object - async~ - script - autocomplete~ - input - autofocus~ - button, input, keygen, select, textarea - autoplay~ - audio, video + async - script + autocomplete - input + autofocus - button, input, keygen, select, textarea + autoplay - audio, video axis - td, th bgcolor - embed, table^, td^, th^, tr^ border - img, object^, table bordercolor - table, td, tr cellpadding - table cellspacing - table - challenge~ - keygen + challenge - keygen char - col, colgroup, tbody, td, tfoot, th, thead, tr charoff - col, colgroup, tbody, td, tfoot, th, thead, tr charset - a, script @@ -1581,94 +1631,94 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern colspan - td, th compact - dir, dl^, menu, ol^, ul^ content - meta - controls~ - audio, 
video + controls - audio, video coords - area, a - crossorigin~ - img + crossorigin - img data - object datetime - del, ins, time declare - object - default~ - track + default - track defer - script dir - bdo - dirname~ - input, textarea + dirname - input, textarea disabled - button, command, fieldset, input, keygen, optgroup, option, select, textarea - download~ - a + download - a enctype - form face - font flashvars** - embed for - label, output - form~ - button, fieldset, input, keygen, label, object, output, select, textarea - formaction~ - button, input - formenctype~ - button, input - formmethod~ - button, input - formnovalidate~ - button, input - formtarget~ - button, input + form - button, fieldset, input, keygen, label, object, output, select, textarea + formaction - button, input + formenctype - button, input + formmethod - button, input + formnovalidate - button, input + formtarget - button, input frame - table frameborder - iframe headers - td, th height - applet, canvas, embed, iframe, img, input, object, td^, th^, video - high~ - meter + high - meter href - a, area, link hreflang - a, area, link hspace - applet, embed, img^, object^ - icon~ - command + icon - command ismap - img, input - keytype~ - keygen - keyparams~ - keygen - kind~ - track + keytype - keygen + keyparams - keygen + kind - track label - command, menu, option, optgroup, track language - script^ - list~ - input + list - input longdesc - img, iframe - loop~ - audio, video - low~ - meter + loop - audio, video + low - meter marginheight - iframe marginwidth - iframe - max~ - input, meter, progress + max - input, meter, progress maxlength - input, textarea - media~ - a, area, link, source, style - mediagroup~ - audio, video + media - a, area, link, source, style + mediagroup - audio, video method - form - min~ - input, meter + min - input, meter model** - embed multiple - input, select - muted~ - audio, video - name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, 
keygen, map^, object, output, param, select, textarea + muted - audio, video + name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, keygen, map^, object, output, param, select, slot, textarea nohref - area noshade - hr^ - novalidate~ - form + novalidate - form nowrap - td^, th^ object - applet - open~ - details - optimum~ - meter - pattern~ - input - ping~ - a, area - placeholder~ - input, textarea + open - details, dialog + optimum - meter + pattern - input + ping - a, area + placeholder - input, textarea pluginspage** - embed pluginurl** - embed - poster~ - video - pqg~ - keygen - preload~ - audio, video + poster - video + pqg - keygen + preload - audio, video prompt - isindex - pubdate~ - time + pubdate - time radiogroup* - command readonly - input, textarea - required~ - input, select, textarea + required - input, select, textarea rel$ - a, area, link rev - a - reversed~ - ol + reversed - ol rows - textarea rowspan - td, th rules - table - sandbox~ - iframe + sandbox - iframe scope - td, th - scoped~ - style + scoped - style scrolling - iframe - seamless~ - iframe + seamless - iframe selected - option shape - area, a size - font, hr^, input, select - sizes~ - link + sizes - link span - col, colgroup src - audio, embed, iframe, img, input, script, source, track, video srcdoc~ - iframe @@ -1692,10 +1742,10 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern The following attributes, including event-specific ones and attributes of ARIA and microdata specifications, are considered global and allowed in all elements: - accesskey, aria-activedescendant, aria-atomic, aria-autocomplete, aria-busy, aria-checked, aria-controls, aria-describedby, aria-disabled, aria-dropeffect, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-label, aria-labelledby, aria-level, aria-live, aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-posinset, aria-pressed, 
aria-readonly, aria-relevant, aria-required, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class$, contenteditable, contextmenu, dir, draggable, dropzone, hidden, id, inert, itemid, itemprop, itemref, itemscope, itemtype, lang, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, role, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space + accesskey, autocapitalize, autofocus, aria-activedescendant, aria-atomic, aria-autocomplete, aria-braillelabel, aria-brailleroledescription, aria-busy, aria-checked, aria-colcount, aria-colindex, aria-colindextext, aria-colspan, aria-controls, aria-current, aria-describedby, aria-description, aria-details, aria-disabled, aria-dropeffect, aria-errormessage, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-keyshortcuts, aria-label, aria-labelledby, aria-level, aria-live, aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-placeholder, aria-posinset, aria-pressed, aria-readonly, aria-relevant, aria-required, aria-roledescription, 
aria-rowcount, aria-rowindex, aria-rowindextext, aria-rowspan, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class, contenteditable, contextmenu, dir, draggable, dropzone, enterkeyhint, hidden, id, inert, inputmode, is, itemid, itemprop, itemref, itemscope, itemtype, lang, nonce, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadend, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, onauxclick, oncancel, onclose, oncontextlost, oncontextrestored, onformdata, onmouseenter, onmouseleave, onresize, onsecuritypolicyviolation, onslotchange, role, slot, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space Custom `data-*` attributes, where the first three characters of the value of `star` (*) after lower-casing do not equal 'xml' and the value of `star` does not have a colon (:), equal-to (=), newline, solidus (/), space, tab, or any A-Z character, are also considered global and allowed in all elements. 
- + -- 5.3 CSS 2.1 properties accepting URLs --------------------------o @@ -1784,22 +1834,22 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Except for the main 'htmLawed()' function, htmLawed's functions are *name-spaced* using the 'hl_' prefix. The *functions* and their roles are: - * 'hl_attrval' - check attribute values against '$spec' - * 'hl_bal' - balance tags and ensure proper nesting - * 'hl_cmtcd' - handle CDATA sections and HTML comments - * 'hl_ent' - handle character entities - * 'hl_prot' - check a URL scheme/protocol + * 'hl_attributeValue' - check attribute values against '$spec' rules + * 'hl_balance' - balance tags and ensure proper nesting + * 'hl_commentCdata' - handle CDATA sections and HTML comments + * 'hl_deprecatedElement' - transform element tags + * 'hl_entity' - handle character entities * 'hl_regex' - check syntax of a regular expression - * 'hl_spec' - convert user-supplied '$spec' value to one used internally + * 'hl_spec' - convert '$spec' value to one used internally * 'hl_tag' - handle element tags and attributes - * 'hl_tag2' - transform element tags - * 'hl_tidy' - compact/beautify HTML + * 'hl_tidy' - compact/beautify HTML + * 'hl_url' - check URL-containing values * 'hl_version' - report htmLawed version * 'htmLawed' - main function - 'htmLawed()' finalizes '$spec' (with the help of 'hl_spec()') and '$config', and globalizes them. Finalization of '$config' involves setting default values if an inappropriate or invalid one is supplied. This includes calling 'hl_regex()' to check well-formedness of regular expression patterns if such expressions are user-supplied through '$config'. 'htmLawed()' then removes invalid characters like nulls and 'x01' and appropriately handles entities using 'hl_ent()'. HTML comments and CDATA sections are identified and treated as per '$config' with the help of 'hl_cmtcd()'. 
When retained, the '<' and '>' characters identifying them, and the '<', '>' and '&' characters inside them, are replaced with control characters (code-points '1' to '5') till any tag balancing is completed. + 'htmLawed()' finalizes '$spec' (with the help of 'hl_spec()') and '$config', and globalizes them. Finalization of '$config' involves setting default values if an inappropriate or invalid one is supplied. This includes calling 'hl_regex()' to check well-formedness of regular expression patterns if such expressions are user-supplied through '$config'. 'htmLawed()' then removes invalid characters like nulls and 'x01' and appropriately handles entities using 'hl_entity()'. HTML comments and CDATA sections are identified and treated as per '$config' with the help of 'hl_commentCdata()'. When retained, the '<' and '>' characters identifying them, and the '<', '>' and '&' characters inside them, are replaced with control characters (code-points '1' to '5') till any tag balancing is completed. - After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_tag2()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attrval()', and checks URL protocols using 'hl_prot()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_bal()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7'). 
+ After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_deprecatedElement()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attributeValue()', and checks URL protocols using 'hl_url()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_balance()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7'). htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized '$config' and '$spec' values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9). diff --git a/htmLawed_TESTCASE.txt b/htmLawed_TESTCASE.txt index c658d28..cf873b2 100755 --- a/htmLawed_TESTCASE.txt +++ b/htmLawed_TESTCASE.txt @@ -1,5 +1,5 @@ /* -htmLawed_TESTCASE.txt, 11 February 2017 +htmLawed_TESTCASE.txt, 23 January 2023 To test htmLawed Copyright Santosh Patnaik Dual licensed with LGPL 3 and GPL 2+ @@ -138,6 +138,16 @@ Disallowed tag p Content invalid?:

(try setting 'form' as parent)
Casing:
Check for tidy:



hi
+Customized element: +Custom element: Click me?A beautiful tree towering over an empty savannah +Custom element: + Facebook + G+ + xx + + +Math: 2 = 2 +SVG:
Entities
@@ -370,6 +380,7 @@ na Alemanha.
Tag transformation
+Font element with malicious code:


Font element intended as 'inline' element:

hi


Font element intended as 'block' element:
hi

Font element intended as 'block' element:
hi
QQQ

@@ -390,6 +401,7 @@ na Alemanha. <img onmouseover=confirm(1)// '';!--"=&{()}
+