0byt3m1n1
Path:
/
data
/
applications
/
aps
/
typo3
/
4.5.5-0
/
standard
/
htdocs
/
t3lib
/
[
Home
]
File: class.t3lib_parsehtml.php
<?php /*************************************************************** * Copyright notice * * (c) 1999-2011 Kasper Skårhøj (kasperYYYY@typo3.com) * All rights reserved * * This script is part of the TYPO3 project. The TYPO3 project is * free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * The GNU General Public License can be found at * http://www.gnu.org/copyleft/gpl.html. * A copy is found in the textfile GPL.txt and important notices to the license * from the author is found in LICENSE.txt distributed with these scripts. * * * This script is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * This copyright notice MUST APPEAR in all copies of the script! ***************************************************************/ /** * Contains class with functions for parsing HTML code. 
* * $Id$ * Revised for TYPO3 3.6 July/2003 by Kasper Skårhøj * * @author Kasper Skårhøj <kasperYYYY@typo3.com> */ /** * [CLASS/FUNCTION INDEX of SCRIPT] * * * * 106: class t3lib_parsehtml * 123: function getSubpart($content, $marker) * 156: function substituteSubpart($content,$marker,$subpartContent,$recursive=1,$keepMarker=0) * * SECTION: Parsing HTML code * 247: function splitIntoBlock($tag,$content,$eliminateExtraEndTags=0) * 308: function splitIntoBlockRecursiveProc($tag,$content,&$procObj,$callBackContent,$callBackTags,$level=0) * 344: function splitTags($tag,$content) * 378: function getAllParts($parts,$tag_parts=1,$include_tag=1) * 396: function removeFirstAndLastTag($str) * 412: function getFirstTag($str) * 426: function getFirstTagName($str,$preserveCase=FALSE) * 445: function get_tag_attributes($tag,$deHSC=0) * 486: function split_tag_attributes($tag) * 524: function checkTagTypeCounts($content,$blockTags='a,b,blockquote,body,div,em,font,form,h1,h2,h3,h4,h5,h6,i,li,map,ol,option,p,pre,select,span,strong,table,td,textarea,tr,u,ul', $soloTags='br,hr,img,input,area') * * SECTION: Clean HTML code * 617: function HTMLcleaner($content, $tags=array(),$keepAll=0,$hSC=0,$addConfig=array()) * 814: function bidir_htmlspecialchars($value,$dir) * 837: function prefixResourcePath($main_prefix,$content,$alternatives=array(),$suffix='') * 919: function prefixRelPath($prefix,$srcVal,$suffix='') * 937: function cleanFontTags($value,$keepFace=0,$keepSize=0,$keepColor=0) * 967: function mapTags($value,$tags=array(),$ltChar='<',$ltChar2='<') * 982: function unprotectTags($content,$tagList='') * 1015: function stripTagsExcept($value,$tagList) * 1038: function caseShift($str,$flag,$cacheKey='') * 1065: function compileTagAttribs($tagAttrib,$meta=array(), $xhtmlClean=0) * 1093: function get_tag_attributes_classic($tag,$deHSC=0) * 1106: function indentLines($content, $number=1, $indentChar=TAB) * 1123: function HTMLparserConfig($TSconfig,$keepTags=array()) * 1247: function 
XHTML_clean($content)
 * 1269: function processTag($value,$conf,$endTag,$protected=0)
 * 1315: function processContent($value,$dir,$conf)
 *
 * TOTAL FUNCTIONS: 28
 * (This index is automatically created/updated by the extension "extdeveval")
 *
 */
/**
 * Functions for parsing HTML.
 * You are encouraged to use this class in your own applications
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage t3lib
 */
class t3lib_parsehtml {

    protected $caseShift_cache = array();

    // Void elements that do not have closing tags, as defined by HTML5, except link element
    const VOID_ELEMENTS = 'area|base|br|col|command|embed|hr|img|input|keygen|meta|param|source|track|wbr';

    /**
     * Extracts the first subpart of $content that is enclosed by two
     * occurrences of $marker. If the markers sit inside HTML comments, the
     * remains of those comments are stripped from the returned subpart.
     *
     * @param string Content containing the subpart, eg. wrapped in "###CONTENT_PART###".
     * @param string Marker string, eg. "###CONTENT_PART###"
     * @return string The subpart, or an empty string if the start or the stop marker is missing
     */
    public static function getSubpart($content, $marker) {
        $openPos = strpos($content, $marker);
        if ($openPos === FALSE) {
            return '';
        }
        $innerStart = $openPos + strlen($marker);
        $closePos = strpos($content, $marker, $innerStart);
        if ($closePos === FALSE) {
            // A missing stop marker yields nothing (not "everything till the end")
            return '';
        }
        $inner = substr($content, $innerStart, $closePos - $innerStart);

        // Pattern => number of the capture group holding the real subpart.
        // Tried in order: comment remains on both sides, only at the end,
        // only at the beginning.
        $candidates = array(
            '/^([^\<]*\-\-\>)(.*)(\<\!\-\-[^\>]*)$/s' => 2,
            '/(.*)(\<\!\-\-[^\>]*)$/s' => 1,
            '/^([^\<]*\-\-\>)(.*)$/s' => 2,
        );
        foreach ($candidates as $pattern => $group) {
            $found = array();
            if (preg_match($pattern, $inner, $found) === 1) {
                return $found[$group];
            }
        }
        return $inner;
    }

    /**
     * Replaces the subpart enclosed by two occurrences of $marker in $content.
     *
     * @param string Content containing the subpart, eg. wrapped in "###CONTENT_PART###".
     * @param string Marker string, eg. "###CONTENT_PART###"
     * @param mixed Replacement string. If an array, its [0] and [1] elements are wrapped around the existing subpart content instead.
     * @param boolean If set, the content following the second marker is processed in the same way, so subsequent subparts with this marker are substituted too.
     * @param boolean If set, the markers (and the HTML comments around them) are kept in the output
     * @return string Processed input content; the unchanged input if the marker does not occur twice
     */
    public static function substituteSubpart($content, $marker, $subpartContent, $recursive = 1, $keepMarker = 0) {
        $markerLength = strlen($marker);

        $openPos = strpos($content, $marker);
        if ($openPos === FALSE) {
            return $content;
        }
        $innerStart = $openPos + $markerLength;
        $closePos = strpos($content, $marker, $innerStart);
        if ($closePos === FALSE) {
            return $content;
        }

        $head = substr($content, 0, $openPos);
        $inner = substr($content, $innerStart, $closePos - $innerStart);
        $tail = substr($content, $closePos + $markerLength);
        $wrapMode = is_array($subpartContent);

        if ($recursive) {
            $tail = self::substituteSubpart($tail, $marker, $subpartContent, $recursive, $keepMarker);
        }

        if ($keepMarker) {
            // Move the comment remains next to the markers so that they survive
            $leadComment = '';
            $trailComment = '';
            $m = array();
            if (preg_match('/^([^\<]*\-\-\>)(.*)(\<\!\-\-[^\>]*)$/s', $inner, $m) === 1) {
                $leadComment = $m[1];
                $inner = $m[2];
                $trailComment = $m[3];
            } elseif (preg_match('/^(.*)(\<\!\-\-[^\>]*)$/s', $inner, $m) === 1) {
                $inner = $m[1];
                $trailComment = $m[2];
            } elseif (preg_match('/^([^\<]*\-\-\>)(.*)$/s', $inner, $m) === 1) {
                $leadComment = $m[1];
                $inner = $m[2];
            }
            $head .= $marker . $leadComment;
            $tail = $trailComment . $marker . $tail;
        } else {
            // Drop the opening of a comment that wrapped the start marker
            $m = array();
            if (preg_match('/^(.*)\<\!\-\-[^\>]*$/s', $head, $m) === 1) {
                $head = $m[1];
            }
            // The old subpart content is only needed when it gets wrapped
            if ($wrapMode) {
                $m = array();
                if (preg_match('/^([^\<]*\-\-\>)(.*)(\<\!\-\-[^\>]*)$/s', $inner, $m) === 1) {
                    $inner = $m[2];
                } elseif (preg_match('/^(.*)(\<\!\-\-[^\>]*)$/s', $inner, $m) === 1) {
                    $inner = $m[1];
                } elseif (preg_match('/^([^\<]*\-\-\>)(.*)$/s', $inner, $m) === 1) {
                    $inner = $m[2];
                }
            }
            // Drop the closing of a comment that wrapped the stop marker
            $m = array();
            if (preg_match('/^[^\<]*\-\-\>(.*)$/s', $tail, $m) === 1) {
                $tail = $m[1];
            }
        }

        $replacement = $wrapMode
            ? $subpartContent[0] . $inner . $subpartContent[1]
            : $subpartContent;

        return $head . $replacement . $tail;
    }

    /**
     * Substitutes several subparts in one go by calling substituteSubpart()
     * once per array element.
     *
     * @param string The content stream, typically HTML template content.
     * @param array Key/value pairs of subpart marker => subpart content.
     * @return string The processed HTML content string.
     */
    public static function substituteSubpartArray($content, array $subpartsContent) {
        foreach ($subpartsContent as $markerName => $replacement) {
            $content = self::substituteSubpart($content, $markerName, $replacement);
        }
        return $content;
    }

    /**
     * Substitutes a marker string in the input content
     * (by a simple str_replace())
     *
     * @param string The content stream, typically HTML template content.
     * @param string The marker string, typically on the form "###[the marker string]###"
     * @param mixed The content to insert instead of the marker string found.
     * @return string The processed HTML content string.
* @see substituteSubpart()
     */
    public static function substituteMarker($content, $marker, $markContent) {
        return str_replace($marker, $markContent, $content);
    }

    /**
     * Traverses the input $markContentArray array and for each key the marker
     * by the same name (possibly wrapped and in upper case) will be
     * substituted with the keys value in the array. This is very useful if you
     * have a data-record to substitute in some content. In particular when you
     * use the $wrap and $uppercase values to pre-process the markers. Eg. a
     * key name like "myfield" could effectively be represented by the marker
     * "###MYFIELD###" if the wrap value was "###|###" and the $uppercase
     * boolean true.
     *
     * @param string The content stream, typically HTML template content.
     * @param array The array of key/value pairs being marker/content values used in the substitution. For each element in this array the function will substitute a marker in the content stream with the content. If this is not an array, $content is returned unchanged.
     * @param string A wrap value - [part 1] | [part 2] - for the markers before substitution
     * @param boolean If set, all marker string substitution is done with upper-case markers.
     * @param boolean If set, all unused marker are deleted.
     * @return string The processed output stream
     * @see substituteMarker(), substituteMarkerInObject(), TEMPLATE()
     */
    public static function substituteMarkerArray($content, $markContentArray, $wrap = '', $uppercase = 0, $deleteUnused = 0) {
        if (!is_array($markContentArray)) {
            return $content;
        }

        $wrapArr = t3lib_div::trimExplode('|', $wrap);
        // A wrap without "|" has no second part; treat the missing part as
        // empty instead of reading an undefined offset.
        $wrapBefore = isset($wrapArr[0]) ? $wrapArr[0] : '';
        $wrapAfter = isset($wrapArr[1]) ? $wrapArr[1] : '';

        foreach ($markContentArray as $marker => $markContent) {
            if ($uppercase) {
                // use strtr instead of strtoupper to avoid locale problems with Turkish
                $marker = strtr(
                    $marker,
                    'abcdefghijklmnopqrstuvwxyz',
                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                );
            }
            $marker = $wrapBefore . $marker . $wrapAfter;
            $content = str_replace($marker, $markContent, $content);
        }

        if ($deleteUnused) {
            if (empty($wrap)) {
                $wrapBefore = '###';
                $wrapAfter = '###';
            }
            // The wrap parts are quoted including the "/" delimiter so that a
            // wrap containing a slash cannot break the pattern.
            $content = preg_replace(
                '/' . preg_quote($wrapBefore, '/') . '([A-Z0-9_|\-]*)' . preg_quote($wrapAfter, '/') . '/is',
                '',
                $content
            );
        }

        return $content;
    }

    /************************************
     *
     * Parsing HTML code
     *
     ************************************/

    /**
     * Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag
     * Even numbers in the array are outside the blocks, Odd numbers are block-content.
     * Use ->getAllParts() and ->removeFirstAndLastTag() to process the content if needed.
     *
     * @param string List of tags, comma separated.
     * @param string HTML-content
     * @param boolean If set, excessive end tags are ignored - you should probably set this in most cases.
     * @return array Even numbers in the array are outside the blocks, Odd numbers are block-content.
     * @see splitTags(), getAllParts(), removeFirstAndLastTag()
     */
    function splitIntoBlock($tag, $content, $eliminateExtraEndTags = 0) {
        $tags = array_unique(t3lib_div::trimExplode(',', $tag, 1));
        $regexStr = '/\<\/?(' . implode('|', $tags) . ')(\s*\>|\s[^\>]*\>)/si';

        $parts = preg_split($regexStr, $content);

        $newParts = array();
        // $pointer always indicates the position in $content of the tag that
        // precedes the part currently being processed.
        $pointer = strlen($parts[0]);
        $buffer = $parts[0];
        $nested = 0;

        // The first part is already in the buffer; walk the remaining parts.
        // (foreach replaces the former each() loop, which no longer exists in PHP 8.)
        foreach (array_slice($parts, 1) as $v) {
            $isEndTag = substr($content, $pointer, 2) == '</' ? 1 : 0;
            $tagLen = strcspn(substr($content, $pointer), '>') + 1;

            if (!$isEndTag) {
                // We meet a start-tag:
                if (!$nested) {
                    // Ground level: previous buffer stored
                    $newParts[] = $buffer;
                    $buffer = '';
                }
                // We are inside now!
                $nested++;
                // New buffer set and pointer increased
                $mbuffer = substr($content, $pointer, strlen($v) + $tagLen);
                $pointer += strlen($mbuffer);
                $buffer .= $mbuffer;
            } else {
                // If we meet an endtag: decrease nested-level
                $nested--;
                $eliminated = 0;
                if ($eliminateExtraEndTags && $nested < 0) {
                    $nested = 0;
                    $eliminated = 1;
                } else {
                    // In any case, add the endtag to current buffer and increase pointer
                    $buffer .= substr($content, $pointer, $tagLen);
                }
                $pointer += $tagLen;
                if (!$nested && !$eliminated) {
                    // We are back on ground level (and not by eliminating tags)
                    $newParts[] = $buffer;
                    $buffer = '';
                }
                // New buffer set and pointer increased
                $mbuffer = substr($content, $pointer, strlen($v));
                $pointer += strlen($mbuffer);
                $buffer .= $mbuffer;
            }
        }
        $newParts[] = $buffer;
        return $newParts;
    }

    /**
     * Splitting content into blocks *recursively* and processing tags/content with call back functions.
     *
     * @param string Tag list, see splitIntoBlock()
     * @param string Content, see splitIntoBlock()
     * @param object Object where call back methods are.
     * @param string Name of call back method for content; "function callBackContent($str,$level)"
     * @param string Name of call back method for tags; "function callBackTags($tags,$level)"
     * @param integer Indent level
     * @return string Processed content
     * @see splitIntoBlock()
     */
    function splitIntoBlockRecursiveProc($tag, $content, &$procObj, $callBackContent, $callBackTags, $level = 0) {
        $parts = $this->splitIntoBlock($tag, $content, TRUE);
        foreach ($parts as $k => $v) {
            if ($k % 2) {
                // Odd index: a tag block. Process its inner content recursively.
                $firstTagName = $this->getFirstTagName($v, TRUE);
                $tagsArray = array();
                $tagsArray['tag_start'] = $this->getFirstTag($v);
                $tagsArray['tag_end'] = '</' . $firstTagName . '>';
                $tagsArray['tag_name'] = strtolower($firstTagName);
                $tagsArray['add_level'] = 1;
                $tagsArray['content'] = $this->splitIntoBlockRecursiveProc(
                    $tag,
                    $this->removeFirstAndLastTag($v),
                    $procObj,
                    $callBackContent,
                    $callBackTags,
                    $level + $tagsArray['add_level']
                );

                if ($callBackTags) {
                    $tagsArray = $procObj->$callBackTags($tagsArray, $level);
                }

                $parts[$k] = $tagsArray['tag_start'] . $tagsArray['content'] . $tagsArray['tag_end'];
            } else {
                // Even index: content outside the blocks
                if ($callBackContent) {
                    $parts[$k] = $procObj->$callBackContent($parts[$k], $level);
                }
            }
        }

        return implode('', $parts);
    }

    /**
     * Returns an array with the $content divided by the tags specified with the list of tags, $tag
     * Even numbers in the array are content outside the tags, Odd numbers are the tags themselves.
     * Use ->getAllParts() and ->removeFirstAndLastTag() to process the content if needed.
     *
     * @param string List of tags
     * @param string HTML-content
     * @return array Even numbers in the array are outside the tags, Odd numbers are the tags.
     * @see splitIntoBlock(), getAllParts(), removeFirstAndLastTag()
     */
    function splitTags($tag, $content) {
        $tags = t3lib_div::trimExplode(',', $tag, 1);
        $regexStr = '/\<(' . implode('|', $tags) . ')(\s[^>]*)?\/?>/si';
        $parts = preg_split($regexStr, $content);

        $pointer = strlen($parts[0]);
        $newParts = array();
        $newParts[] = $parts[0];

        // The first part is already stored; walk the remaining parts.
        // (foreach replaces the former each() loop, which no longer exists in PHP 8.)
        foreach (array_slice($parts, 1) as $v) {
            $tagLen = strcspn(substr($content, $pointer), '>') + 1;

            // Set tag:
            $tagStr = substr($content, $pointer, $tagLen);
            $newParts[] = $tagStr;
            $pointer += strlen($tagStr);

            // Set content:
            $newParts[] = $v;
            $pointer += strlen($v);
        }
        return $newParts;
    }

    /**
     * Returns an array with either tag or non-tag content of the result from ->splitIntoBlock()/->splitTags()
     *
     * @param array Parts generated by ->splitIntoBlock() or ->splitTags()
     * @param boolean Whether to return the tag-parts (default,true) or what was outside the tags.
* @param boolean Whether to include the tags in the tag-parts (most useful for input made by ->splitIntoBlock())
     * @return array Tag-parts/Non-tag-parts depending on input argument settings
     * @see splitIntoBlock(), splitTags()
     */
    function getAllParts($parts, $tag_parts = 1, $include_tag = 1) {
        $selected = array();
        $wantOddKeys = (bool) $tag_parts;
        foreach ($parts as $index => $piece) {
            // Odd keys hold the tag parts, even keys what lies outside of them
            $isOddKey = ($index % 2) != 0;
            if ($isOddKey != $wantOddKeys) {
                continue;
            }
            $selected[] = $include_tag ? $piece : $this->removeFirstAndLastTag($piece);
        }
        return $selected;
    }

    /**
     * Strips the first and the last tag from the string.
     * Whatever precedes the end of the first tag or follows the start of
     * the last tag is removed along with them.
     *
     * @param string String to process
     * @return string
     */
    function removeFirstAndLastTag($str) {
        // Position right after the ">" closing the first tag
        $innerStart = strpos($str, '>') + 1;
        // Position of the "<" opening the last tag
        $lastTagStart = strrpos($str, '<');

        return substr($str, $innerStart, $lastTagStart - $innerStart);
    }

    /**
     * Returns the first tag in $str.
     * In fact everything from the very beginning of $str up to and including
     * the first ">" is returned, so make sure the tag is the first thing in
     * the string.
     *
     * @param string HTML string with tags
     * @return string
     */
    function getFirstTag($str) {
        return substr($str, 0, strpos($str, '>') + 1);
    }

    /**
     * Returns the NAME of the first tag in $str
     *
     * @param string HTML tag; only whitespace may precede the opening "<"
     * @param boolean If set, the case of the tag name is preserved, otherwise the name is converted to upper case.
     * @return string Tag name (upper case unless $preserveCase is set), or an empty string if $str does not start with a tag
     * @see getFirstTag()
     */
    function getFirstTagName($str, $preserveCase = FALSE) {
        $found = array();
        if (preg_match('/^\s*\<([^\s\>]+)(\s|\>)/', $str, $found) !== 1) {
            return '';
        }
        return $preserveCase ? $found[1] : strtoupper($found[1]);
    }

    /**
     * Returns an array with all attributes as keys. Attributes are only lowercase a-z
     * If a attribute is empty (shorthand), then the value for the key is empty.
You can check if it existed with isset()
     *
     * The tag is split into components by split_tag_attributes(). A component
     * following a "=" is stored as the value of the attribute name seen before it.
     *
     * @param string Tag: $tag is either a whole tag (eg '<TAG OPTION ATTRIB=VALUE>') or the parameterlist (ex ' OPTION ATTRIB=VALUE>')
     * @param boolean If set, the attribute values are de-htmlspecialchar'ed. Should actually always be set!
     * @return array array(Tag attributes,Attribute meta-data). Always an array of two arrays, also when no components were found.
     */
    function get_tag_attributes($tag, $deHSC = 0) {
        list($components, $metaC) = $this->split_tag_attributes($tag);
        // Name of the attribute currently waiting for a value
        $name = '';
        $valuemode = FALSE;
        $attributes = array();
        $attributesMeta = array();

        if (is_array($components)) {
            foreach ($components as $key => $val) {
                if ($val != '=') {
                    if ($valuemode) {
                        // A value is only stored if an attribute name is waiting for it
                        if ($name) {
                            $attributes[$name] = $deHSC ? t3lib_div::htmlspecialchars_decode($val) : $val;
                            // Quote character reported by split_tag_attributes() for this value
                            $attributesMeta[$name]['dashType'] = $metaC[$key];
                            $name = '';
                        }
                    } else {
                        // Attribute name: strip everything except alphanumerics, "_", ":" and "-"
                        if ($namekey = preg_replace('/[^[:alnum:]_\:\-]/', '', $val)) {
                            $name = strtolower($namekey);
                            $attributesMeta[$name] = array();
                            $attributesMeta[$name]['origTag'] = $namekey;
                            $attributes[$name] = '';
                        }
                    }
                    $valuemode = FALSE;
                } else {
                    $valuemode = TRUE;
                }
            }
        }

        // Returned unconditionally: the result used to be returned only from
        // inside the is_array() branch, so the function could end without a
        // return value (NULL) although callers index the result.
        return array($attributes, $attributesMeta);
    }

    /**
     * Returns an array with the 'components' from an attribute list.
The result is normally analyzed by get_tag_attributes
     * Removes tag-name if found
     *
     * @param string The tag or attributes
     * @return array array(components, quote characters): the second array holds, per component, the quote character (" or ') the component was wrapped in, or an empty string
     * @access private
     * @see t3lib_div::split_tag_attributes()
     */
    function split_tag_attributes($tag) {
        // Isolate the attribute list: drop a leading "<tagname " and a trailing ">"
        $tagMatch = array();
        if (preg_match('/(\<[^\s]+\s+)?(.*?)\s*(\>)?$/s', $tag, $tagMatch) !== 1) {
            return array(array(), array());
        }
        $attributeList = $tagMatch[2];

        $components = array();
        $quoteChars = array();

        // Tokens are quoted strings, bare words and "=" signs
        $tokens = array();
        if (preg_match_all('/("[^"]*"|\'[^\']*\'|[^\s"\'\=]+|\=)/s', $attributeList, $tokens) > 0) {
            foreach ($tokens[1] as $token) {
                $lead = substr($token, 0, 1);
                $isQuoted = ($lead == '"' || $lead == "'");
                $quoteChars[] = $isQuoted ? $lead : '';
                // Quoted tokens are stored without their surrounding quotes
                $components[] = $isQuoted ? substr($token, 1, -1) : $token;
            }
        }

        return array($components, $quoteChars);
    }

    /**
     * Checks whether block/solo tags are found in the correct amounts in HTML content
     * Block tags are tags which are required to have an equal amount of start and end tags, eg. "<table>...</table>"
     * Solo tags are tags which are required to have ONLY start tags (possibly with an XHTML ending like ".../>")
     * NOTICE: Correct XHTML might actually fail since "<br></br>" is allowed as well as "<br/>". However only the LATTER is accepted by this function (with "br" in the "solo-tag" list), the first example will result in a warning.
     * NOTICE: Correct XHTML might actually fail since "<p/>" is allowed as well as "<p></p>". However only the LATTER is accepted by this function (with "p" in the "block-tag" list), the first example will result in an ERROR!
     * NOTICE: Correct HTML version "something" allows eg. <p> and <li> to be NON-ended (implicitly ended by other tags). However this is NOT accepted by this function (with "p" and "li" in the block-tag list) and it will result in an ERROR!
     *
     * @param string HTML content to analyze
     * @param string Tag names for block tags (eg. table or div or p) in lowercase, commalist (eg. "table,div,p")
     * @param string Tag names for solo tags (eg.
img, br or input) in lowercase, commalist ("img,br,input")
     * @return array Analysis data with the keys "counts" (start-tag count per tag found), "errors", "warnings", "blocks" (start/end/difference per block tag) and "solo" (start/end per solo tag)
     */
    function checkTagTypeCounts($content, $blockTags = 'a,b,blockquote,body,div,em,font,form,h1,h2,h3,h4,h5,h6,i,li,map,ol,option,p,pre,select,span,strong,table,td,textarea,tr,u,ul', $soloTags = 'br,hr,img,input,area') {
        // Matching is done on the lowercased content
        $content = strtolower($content);

        $report = array(
            // Counts appearances of start-tags
            'counts' => array(),
            // Lists ERRORS
            'errors' => array(),
            // Lists warnings.
            'warnings' => array(),
            // Lists stats for block-tags
            'blocks' => array(),
            // Lists stats for solo-tags
            'solo' => array(),
        );

        // Block tags, must have endings...
        foreach (explode(',', $blockTags) as $tagName) {
            // Number of matches = number of split pieces minus one
            $opened = count(preg_split('/\<' . $tagName . '(\s|\>)/s', $content)) - 1;
            $closed = count(preg_split('/\<\/' . $tagName . '(\s|\>)/s', $content)) - 1;
            $surplus = $opened - $closed;

            $report['blocks'][$tagName] = array($opened, $closed, $surplus);
            if ($opened) {
                $report['counts'][$tagName] = $opened;
            }
            if ($surplus > 0) {
                $report['errors'][$tagName] = 'There were more start-tags (' . $opened . ') than end-tags (' . $closed . ') for the element "' . $tagName . '". There should be an equal amount!';
            } elseif ($surplus < 0) {
                $report['warnings'][$tagName] = 'There were more end-tags (' . $closed . ') than start-tags (' . $opened . ') for the element "' . $tagName . '". There should be an equal amount! However the problem is not fatal.';
            }
        }

        // Solo tags, must NOT have endings...
        foreach (explode(',', $soloTags) as $tagName) {
            $opened = count(preg_split('/\<' . $tagName . '(\s|\>)/s', $content)) - 1;
            $closed = count(preg_split('/\<\/' . $tagName . '(\s|\>)/s', $content)) - 1;

            $report['solo'][$tagName] = array($opened, $closed);
            if ($opened) {
                $report['counts'][$tagName] = $opened;
            }
            if ($closed) {
                $report['warnings'][$tagName] = 'There were end-tags found (' . $closed . ') for the element "' . $tagName . '". This was not expected (although XHTML technically allows it).';
            }
        }

        return $report;
    }

    /*********************************
     *
     * Clean HTML code
     *
     *********************************/

    /**
     * Function that can clean up HTML content according to configuration given in the $tags array.
     *
     * Initializing the $tags array to allow a list of tags (in this case <B>,<I>,<U> and <A>), set it like this: $tags = array_flip(explode(',','b,a,i,u'))
     * If the value of the $tags[$tagname] entry is an array, advanced processing of the tags is initialized. These are the options:
     *
     * $tags[$tagname] = Array(
     * 'overrideAttribs' => '' If set, this string is preset as the attributes of the tag
     * 'allowedAttribs' => '0' (zero) = no attributes allowed, '[commalist of attributes]' = only allowed attributes. If blank, all attributes are allowed.
     * 'fixAttrib' => Array(
     * '[attribute name]' => Array (
     * 'set' => Force the attribute value to this value.
     * 'unset' => Boolean: If set, the attribute is unset.
     * 'default' => If no attribute exists by this name, this value is set as default value (if this value is not blank)
     * 'always' => Boolean. If set, the attribute is always processed. Normally an attribute is processed only if it exists
     * 'trim,intval,lower,upper' => All booleans. If any of these keys are set, the value is passed through the respective PHP-functions.
     * 'range' => Array ('[low limit]','[high limit, optional]') Setting integer range.
     * 'list' => Array ('[value1/default]','[value2]','[value3]') Attribute must be in this list. If not, the value is set to the first element.
     * 'removeIfFalse' => Boolean/'blank'. If set, then the attribute is removed if it is 'false'.
If this value is set to 'blank' then the value must be a blank string (that means a 'zero' value will not be removed) * 'removeIfEquals' => [value] If the attribute value matches the value set here, then it is removed. * 'casesensitiveComp' => 1 If set, then the removeIfEquals and list comparisons will be case sensitive. Otherwise not. * ) * ), * 'protect' => '', Boolean. If set, the tag <> is converted to &lt; and &gt; * 'remap' => '', String. If set, the tagname is remapped to this tagname * 'rmTagIfNoAttrib' => '', Boolean. If set, then the tag is removed if no attributes happen to be there. * 'nesting' => '', Boolean/'global'. If set true, then this tag must have starting and ending tags in the correct order. Any tags not in this order will be discarded. Thus '</B><B><I></B></I></B>' will be converted to '<B><I></B></I>'. Is the value 'global' then true nesting in relation to other tags marked for 'global' nesting control is preserved. This means that if <B> and <I> are set for global nesting then this string '</B><B><I></B></I></B>' is converted to '<B></B>' * ) * * @param string $content; is the HTML-content being processed. This is also the result being returned. * @param array $tags; is an array where each key is a tagname in lowercase. Only tags present as keys in this array are preserved. The value of the key can be an array with a vast number of options to configure. * @param string $keepAll; boolean/'protect', if set, then all tags are kept regardless of tags present as keys in $tags-array. If 'protect' then the preserved tags have their <> converted to &lt; and &gt; * @param integer $hSC; Values -1,0,1,2: Set to zero= disabled, set to 1 then the content BETWEEN tags is htmlspecialchar()'ed, set to -1 its the opposite and set to 2 the content will be HSC'ed BUT with preservation for real entities (eg. 
"&" or "ê") * @param array Configuration array send along as $conf to the internal functions ->processContent() and ->processTag() * @return string Processed HTML content */ function HTMLcleaner($content, $tags = array(), $keepAll = 0, $hSC = 0, $addConfig = array()) { $newContent = array(); $tokArr = explode('<', $content); $newContent[] = $this->processContent(current($tokArr), $hSC, $addConfig); next($tokArr); $c = 1; $tagRegister = array(); $tagStack = array(); $inComment = FALSE; $skipTag = FALSE; while (list(, $tok) = each($tokArr)) { if ($inComment) { if (($eocPos = strpos($tok, '-->')) === FALSE) { // End of comment is not found in the token. Go futher until end of comment is found in other tokens. $newContent[$c++] = '<' . $tok; continue; } // Comment ends in the middle of the token: add comment and proceed with rest of the token $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3); $tok = substr($tok, $eocPos + 3); $inComment = FALSE; $skipTag = TRUE; } elseif (substr($tok, 0, 3) == '!--') { if (($eocPos = strpos($tok, '-->')) === FALSE) { // Comment started in this token but it does end in the same token. Set a flag to skip till the end of comment $newContent[$c++] = '<' . $tok; $inComment = TRUE; continue; } // Start and end of comment are both in the current token. Add comment and proceed with rest of the token $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3); $tok = substr($tok, $eocPos + 3); $skipTag = TRUE; } $firstChar = substr($tok, 0, 1); if (!$skipTag && preg_match('/[[:alnum:]\/]/', $firstChar) == 1) { // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..> $tagEnd = strpos($tok, '>'); if ($tagEnd) { // If there is and end-bracket... tagEnd can't be 0 as the first character can't be a > $endTag = $firstChar == '/' ? 
1 : 0; $tagContent = substr($tok, $endTag, $tagEnd - $endTag); $tagParts = preg_split('/\s+/s', $tagContent, 2); $tagName = strtolower($tagParts[0]); $emptyTag = 0; if (isset($tags[$tagName])) { if (is_array($tags[$tagName])) { // If there is processing to do for the tag: if (preg_match('/^(' . self::VOID_ELEMENTS . ' )$/i', $tagName)) { $emptyTag = 1; } if (!$endTag) { // If NOT an endtag, do attribute processing (added dec. 2003) // Override attributes if (strcmp($tags[$tagName]['overrideAttribs'], '')) { $tagParts[1] = $tags[$tagName]['overrideAttribs']; } // Allowed tags if (strcmp($tags[$tagName]['allowedAttribs'], '')) { if (!strcmp($tags[$tagName]['allowedAttribs'], '0')) { // No attribs allowed $tagParts[1] = ''; } elseif (trim($tagParts[1])) { $tagAttrib = $this->get_tag_attributes($tagParts[1]); $tagParts[1] = ''; $newTagAttrib = array(); if (!($tList = $tags[$tagName]['_allowedAttribs'])) { // Just explode attribts for tag once $tList = $tags[$tagName]['_allowedAttribs'] = t3lib_div::trimExplode(',', strtolower($tags[$tagName]['allowedAttribs']), 1); } foreach ($tList as $allowTag) { if (isset($tagAttrib[0][$allowTag])) { $newTagAttrib[$allowTag] = $tagAttrib[0][$allowTag]; } } $tagParts[1] = $this->compileTagAttribs($newTagAttrib, $tagAttrib[1]); } } // Fixed attrib values if (is_array($tags[$tagName]['fixAttrib'])) { $tagAttrib = $this->get_tag_attributes($tagParts[1]); $tagParts[1] = ''; foreach ($tags[$tagName]['fixAttrib'] as $attr => $params) { if (strlen($params['set'])) { $tagAttrib[0][$attr] = $params['set']; } if (strlen($params['unset'])) { unset($tagAttrib[0][$attr]); } if (strcmp($params['default'], '') && !isset($tagAttrib[0][$attr])) { $tagAttrib[0][$attr] = $params['default']; } if ($params['always'] || isset($tagAttrib[0][$attr])) { if ($params['trim']) { $tagAttrib[0][$attr] = trim($tagAttrib[0][$attr]); } if ($params['intval']) { $tagAttrib[0][$attr] = intval($tagAttrib[0][$attr]); } if ($params['lower']) { $tagAttrib[0][$attr] = 
strtolower($tagAttrib[0][$attr]); } if ($params['upper']) { $tagAttrib[0][$attr] = strtoupper($tagAttrib[0][$attr]); } if ($params['range']) { if (isset($params['range'][1])) { $tagAttrib[0][$attr] = t3lib_div::intInRange($tagAttrib[0][$attr], intval($params['range'][0]), intval($params['range'][1])); } else { $tagAttrib[0][$attr] = t3lib_div::intInRange($tagAttrib[0][$attr], intval($params['range'][0])); } } if (is_array($params['list'])) { // For the class attribute, remove from the attribute value any class not in the list // Classes are case sensitive if ($attr == 'class') { $newClasses = array(); $classes = t3lib_div::trimExplode(' ', $tagAttrib[0][$attr], TRUE); foreach ($classes as $class) { if (in_array($class, $params['list'])) { $newClasses[] = $class; } } if (count($newClasses)) { $tagAttrib[0][$attr] = implode(' ', $newClasses); } else { $tagAttrib[0][$attr] = ''; } } else { if (!in_array($this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']), $this->caseShift($params['list'], $params['casesensitiveComp'], $tagName))) { $tagAttrib[0][$attr] = $params['list'][0]; } } } if (($params['removeIfFalse'] && $params['removeIfFalse'] != 'blank' && !$tagAttrib[0][$attr]) || ($params['removeIfFalse'] == 'blank' && !strcmp($tagAttrib[0][$attr], ''))) { unset($tagAttrib[0][$attr]); } if (strcmp($params['removeIfEquals'], '') && !strcmp($this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']), $this->caseShift($params['removeIfEquals'], $params['casesensitiveComp']))) { unset($tagAttrib[0][$attr]); } if ($params['prefixLocalAnchors']) { if (substr($tagAttrib[0][$attr], 0, 1) == '#') { $prefix = t3lib_div::getIndpEnv('TYPO3_REQUEST_URL'); $tagAttrib[0][$attr] = $prefix . 
$tagAttrib[0][$attr]; if ($params['prefixLocalAnchors'] == 2 && t3lib_div::isFirstPartOfStr($prefix, t3lib_div::getIndpEnv('TYPO3_SITE_URL'))) { $tagAttrib[0][$attr] = substr($tagAttrib[0][$attr], strlen(t3lib_div::getIndpEnv('TYPO3_SITE_URL'))); } } } if ($params['prefixRelPathWith']) { $urlParts = parse_url($tagAttrib[0][$attr]); if (!$urlParts['scheme'] && substr($urlParts['path'], 0, 1) != '/') { // If it is NOT an absolute URL (by http: or starting "/") $tagAttrib[0][$attr] = $params['prefixRelPathWith'] . $tagAttrib[0][$attr]; } } if ($params['userFunc']) { $tagAttrib[0][$attr] = t3lib_div::callUserFunction($params['userFunc'], $tagAttrib[0][$attr], $this); } } } $tagParts[1] = $this->compileTagAttribs($tagAttrib[0], $tagAttrib[1]); } } else { // If endTag, remove any possible attributes: $tagParts[1] = ''; } // Protecting the tag by converting < and > to < and > ?? if ($tags[$tagName]['protect']) { $lt = '<'; $gt = '>'; } else { $lt = '<'; $gt = '>'; } // Remapping tag name? if ($tags[$tagName]['remap']) { $tagParts[0] = $tags[$tagName]['remap']; } // rmTagIfNoAttrib if ($endTag || trim($tagParts[1]) || !$tags[$tagName]['rmTagIfNoAttrib']) { $setTag = 1; // Remove this closing tag if $tagName was among $TSconfig['removeTags'] if ($endTag && $tags[$tagName]['allowedAttribs'] === 0 && $tags[$tagName]['rmTagIfNoAttrib'] === 1) { $setTag = 0; } if ($tags[$tagName]['nesting']) { if (!is_array($tagRegister[$tagName])) { $tagRegister[$tagName] = array(); } if ($endTag) { /* if ($tags[$tagName]['nesting']=='global') { $lastEl = end($tagStack); $correctTag = !strcmp($tagName,$lastEl); } else $correctTag=1; */ $correctTag = 1; if ($tags[$tagName]['nesting'] == 'global') { $lastEl = end($tagStack); if (strcmp($tagName, $lastEl)) { if (in_array($tagName, $tagStack)) { while (count($tagStack) && strcmp($tagName, $lastEl)) { $elPos = end($tagRegister[$lastEl]); unset($newContent[$elPos]); array_pop($tagRegister[$lastEl]); array_pop($tagStack); $lastEl = end($tagStack); } 
} else { $correctTag = 0; // In this case the } } } if (!count($tagRegister[$tagName]) || !$correctTag) { $setTag = 0; } else { array_pop($tagRegister[$tagName]); if ($tags[$tagName]['nesting'] == 'global') { array_pop($tagStack); } } } else { array_push($tagRegister[$tagName], $c); if ($tags[$tagName]['nesting'] == 'global') { array_push($tagStack, $tagName); } } } if ($setTag) { // Setting the tag $newContent[$c++] = $this->processTag($lt . ($endTag ? '/' : '') . trim($tagParts[0] . ' ' . $tagParts[1]) . ($emptyTag ? ' /' : '' ) . $gt, $addConfig, $endTag, $lt == '<'); } } } else { $newContent[$c++] = $this->processTag('<' . ($endTag ? '/' : '') . $tagContent . '>', $addConfig, $endTag); } } elseif ($keepAll) { // This is if the tag was not defined in the array for processing: if (!strcmp($keepAll, 'protect')) { $lt = '<'; $gt = '>'; } else { $lt = '<'; $gt = '>'; } $newContent[$c++] = $this->processTag($lt . ($endTag ? '/' : '') . $tagContent . $gt, $addConfig, $endTag, $lt == '<'); } $newContent[$c++] = $this->processContent(substr($tok, $tagEnd + 1), $hSC, $addConfig); } else { $newContent[$c++] = $this->processContent('<' . $tok, $hSC, $addConfig); // There were not end-bracket, so no tag... } } else { $newContent[$c++] = $this->processContent(($skipTag ? '' : '<') . 
$tok, $hSC, $addConfig); // It was not a tag anyways
                $skipTag = FALSE;
            }
        }

        // Unsetting tags:
        foreach ($tagRegister as $tag => $positions) {
            foreach ($positions as $pKey) {
                unset($newContent[$pKey]);
            }
        }

        return implode('', $newContent);
    }

    /**
     * Converts htmlspecialchars forth ($dir=1 or $dir=2) AND back ($dir=-1).
     * Any other value of $dir returns $value unchanged.
     *
     * @param string $value Input value
     * @param integer $dir Direction: 1 = htmlspecialchars(); 2 = htmlspecialchars() followed by t3lib_div::deHSCentities() ("preserving entities"); -1 = decode &gt; &lt; &quot; &amp; back to their characters
     * @return string Output value
     */
    function bidir_htmlspecialchars($value, $dir) {
        if ($dir == 1) {
            $value = htmlspecialchars($value);
        } elseif ($dir == 2) {
            $value = t3lib_div::deHSCentities(htmlspecialchars($value));
        } elseif ($dir == -1) {
            // The pairs are applied in order; "&amp;" has to come last,
            // otherwise "&amp;lt;" would be decoded twice and end up as "<".
            $value = str_replace(
                array('&gt;', '&lt;', '&quot;', '&amp;'),
                array('>', '<', '"', '&'),
                $value
            );
        }
        return $value;
    }

    /**
     * Prefixes the relative paths of hrefs/src/action in the tags [embed,td,table,body,img,input,form,link,script,a,param]
     * in the $content with the $main_prefix or an alternative given by $alternatives.
     * Afterwards every url(...) inside <style> blocks gets the prefix/suffix as well
     * (unconditionally; the values are not passed through prefixRelPath()).
     *
     * @param string $main_prefix Prefix string
     * @param string $content HTML content
     * @param array $alternatives Array with alternative prefixes for certain of the tags. key=>value pairs where the keys are the tag element names in uppercase. The <style> section is looked up with the lowercase key "style".
     * @param string $suffix Suffix string (put after the resource).
     * @return string Processed HTML content
     */
    function prefixResourcePath($main_prefix, $content, $alternatives = array(), $suffix = '') {
        $parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a,param', $content);
        foreach ($parts as $k => $v) {
            // Only the odd-indexed parts are handled as tags here.
            if (!($k % 2)) {
                continue;
            }

            $params = $this->get_tag_attributes($v);
            // Detect tag-ending so that it is re-applied correctly.
            $tagEnd = substr($v, -2) == '/>' ? ' />' : '>';
            // The 'name' of the first tag
            $firstTagName = $this->getFirstTagName($v);
            $alternativeKey = strtoupper($firstTagName);
            $prefix = isset($alternatives[$alternativeKey]) ? $alternatives[$alternativeKey] : $main_prefix;

            // Find the attribute that holds the resource reference for this tag.
            $attribute = '';
            switch (strtolower($firstTagName)) {
                // background attribute
                case 'td':
                case 'body':
                case 'table':
                    $attribute = 'background';
                    break;
                // src attribute
                case 'img':
                case 'input':
                case 'script':
                case 'embed':
                    $attribute = 'src';
                    break;
                // href attribute
                case 'link':
                case 'a':
                    $attribute = 'href';
                    break;
                // action attribute
                case 'form':
                    $attribute = 'action';
                    break;
                // value attribute, but only for <param name="movie">
                case 'param':
                    if (isset($params[0]['name']) && $params[0]['name'] === 'movie') {
                        $attribute = 'value';
                    }
                    break;
            }

            // Rebuild the tag only if there is a non-empty value to prefix.
            if ($attribute !== '' && !empty($params[0][$attribute])) {
                $params[0][$attribute] = $this->prefixRelPath($prefix, $params[0][$attribute], $suffix);
                $attributeString = $this->compileTagAttribs($params[0], $params[1]);
                $parts[$k] = '<' . trim(strtolower($firstTagName) . ' ' . $attributeString) . $tagEnd;
            }
        }
        $content = implode('', $parts);

        // Fix <style> section:
        $prefix = isset($alternatives['style']) ? $alternatives['style'] : $main_prefix;
        if (strlen($prefix)) {
            // Build the replacement with ${n} references and escape "\" and "$"
            // in the inserted strings. With the short form ('\1' . $prefix) a
            // prefix starting with a digit would be read as a two-digit
            // back-reference (e.g. "\12") and silently drop text.
            $escapeMap = array('\\' => '\\\\', '$' => '\\$');
            $replacement = '${1}' . strtr($prefix, $escapeMap) . '${2}' . strtr($suffix, $escapeMap) . '${3}';

            $parts = $this->splitIntoBlock('style', $content);
            foreach ($parts as $k => $v) {
                if ($k % 2) {
                    $parts[$k] = preg_replace('/(url[[:space:]]*\([[:space:]]*["\']?)([^"\')]*)(["\']?[[:space:]]*\))/i', $replacement, $v);
                }
            }
            $content = implode('', $parts);
        }
        return $content;
    }

    /**
     * Internal sub-function for ->prefixResourcePath()
     *
     * The value is returned untouched when it starts with "/" or "#", or when
     * parse_url() finds a scheme in it. Otherwise $prefix is put in front and
     * $suffix behind it.
     *
     * @param string $prefix Prefix string
     * @param string $srcVal Relative path/URL
     * @param string $suffix Suffix string
     * @return string Output path, prefixed if no scheme in input string
     * @access private
     */
    function prefixRelPath($prefix, $srcVal, $suffix = '') {
        // Absolute paths and links to a section within the page are kept as they are.
        $firstChar = substr($srcVal, 0, 1);
        if ($firstChar === '/' || $firstChar === '#') {
            return $srcVal;
        }

        // Only prefix URLs without a scheme. empty() also covers the case where
        // parse_url() returns FALSE for a seriously malformed URL.
        $urlParts = parse_url($srcVal);
        if (empty($urlParts['scheme'])) {
            $srcVal = $prefix . $srcVal . $suffix;
        }
        return $srcVal;
    }

    /**
     * Cleans up the input $value for fonttags.
     * A font-tag is kept (with only the allowed attributes) if at least one
     * allowed attribute has a non-empty value; otherwise only its inner content
     * is kept. Nested font-tags are processed recursively.
     *
     * @param string $value HTML content with font-tags inside to clean up.
     * @param boolean $keepFace If set, keep "face" attribute
     * @param boolean $keepSize If set, keep "size" attribute
     * @param boolean $keepColor If set, keep "color" attribute
     * @return string Processed HTML content
     */
    function cleanFontTags($value, $keepFace = 0, $keepSize = 0, $keepColor = 0) {
        // ,1 ?? - could probably be more stable if splitTags() was used since this depends on end-tags being properly set!
        $fontSplit = $this->splitIntoBlock('font', $value);
        // Attribute name => "keep it" flag, in the order the attributes are written out.
        $keepAttribs = array('face' => $keepFace, 'size' => $keepSize, 'color' => $keepColor);

        foreach ($fontSplit as $k => $v) {
            if ($k % 2) { // font:
                $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v));
                $newAttribs = array();
                foreach ($keepAttribs as $attribName => $keep) {
                    if ($keep && !empty($attribArray[$attribName])) {
                        $newAttribs[] = $attribName . '="' . $attribArray[$attribName] . '"';
                    }
                }

                $innerContent = $this->cleanFontTags($this->removeFirstAndLastTag($v), $keepFace, $keepSize, $keepColor);
                if (count($newAttribs)) {
                    $fontSplit[$k] = '<font ' . implode(' ', $newAttribs) . '>' . $innerContent . '</font>';
                } else {
                    $fontSplit[$k] = $innerContent;
                }
            }
        }
        return implode('', $fontSplit);
    }

    /**
     * This is used to map certain tag-names into other names.
     * Start tags (with or without attributes), self-closing tags and end tags
     * are mapped; attributes are carried over unchanged. Matching is
     * case-sensitive. Eg. with $tags = array('b' => 'strong'):
     * '<b class="x">t</b>' becomes '<strong class="x">t</strong>'.
     *
     * @param string $value HTML content
     * @param array $tags Array with tag key=>value pairs where key is from-tag and value is to-tag
     * @param string $ltChar Alternative less-than char to search for (matched literally)
     * @param string $ltChar2 Alternative less-than char to replace with (inserted as-is into the replace regex string)
     * @return string Processed HTML content
     */
    function mapTags($value, $tags = array(), $ltChar = '<', $ltChar2 = '<') {
        $escapeMap = array('\\' => '\\\\', '$' => '\\$');
        foreach ($tags as $from => $to) {
            // Group 1: optional "/" of an end tag.
            // Group 2: the complete attribute string including its leading whitespace
            //          (a repeated one-character group would only keep the last character).
            // Group 3: optional "/" of a self-closing tag.
            $pattern = '/' . preg_quote($ltChar, '/') . '(\/)?' . preg_quote($from, '/') . '((?:\s[^>]*?)?)(\/)?>/';
            // ${1} instead of $1: a $to starting with a digit (e.g. an md5 hash)
            // must not extend the back-reference number.
            $replacement = $ltChar2 . '${1}' . strtr($to, $escapeMap) . '${2}${3}>';
            $value = preg_replace($pattern, $replacement, $value);
        }
        return $value;
    }

    /**
     * This converts htmlspecialchar()'ed tags (from $tagList) back to real tags.
     * Eg. '&lt;strong&gt;' would be converted back to '<strong>' if found in $tagList
     *
     * @param string $content HTML content
     * @param string $tagList Tag list, separated by comma. Lowercase! If empty, all protected tags are converted back.
     * @return string Processed HTML content
     */
    function unprotectTags($content, $tagList = '') {
        $tagsArray = t3lib_div::trimExplode(',', $tagList, 1);
        $contentParts = explode('&lt;', $content);

        foreach ($contentParts as $k => $tok) {
            if ($k === 0) {
                // Everything before the first "&lt;" is left untouched.
                continue;
            }

            // Default: put the protected "&lt;" back unchanged.
            $contentParts[$k] = '&lt;' . $tok;

            $firstChar = (string) substr($tok, 0, 1);
            if (trim($firstChar) === '') {
                // Empty token or whitespace right after "&lt;": not a tag.
                continue;
            }

            $subparts = explode('&gt;', $tok, 2);
            if (count($subparts) < 2) {
                // No closing "&gt;" found: not a tag.
                continue;
            }

            $endTag = $firstChar == '/' ? 1 : 0;
            $tagContent = substr($tok, $endTag, strlen($subparts[0]) - $endTag);
            $tagParts = preg_split('/\s+/s', $tagContent, 2);
            $tagName = strtolower($tagParts[0]);

            if (!strcmp($tagList, '') || in_array($tagName, $tagsArray, TRUE)) {
                $contentParts[$k] = '<' . $subparts[0] . '>' . $subparts[1];
            }
        }
        return implode('', $contentParts);
    }

    /**
     * Strips tags except the tags in the list, $tagList
     * OBSOLETE - use PHP function strip_tags()
     *
     * Works by renaming the allowed tags to '_' + md5(tag name) via mapTags(),
     * running strip_tags() and renaming them back.
     *
     * @param string $value Value to process
     * @param string $tagList List of tags
     * @return string Output value
     * @ignore
     */
    function stripTagsExcept($value, $tagList) {
        $tags = t3lib_div::trimExplode(',', $tagList, 1);
        $forthArr = array();
        $backArr = array();
        foreach ($tags as $theTag) {
            // md5() only serves as a placeholder name here, not for security.
            $hash = md5($theTag);
            $forthArr[$theTag] = $hash;
            $backArr[$hash] = $theTag;
        }
        $value = $this->mapTags($value, $forthArr, '<', '_');
        $value = strip_tags($value);
        $value = $this->mapTags($value, $backArr, '_', '<');
        return $value;
    }

    /**
     * Internal function for case shifting of a string or whole array
     *
     * If $flag (case sensitive) is false, the string / all array values are
     * returned in uppercase; if it is true, the input is returned unchanged.
     * Array results are cached in $this->caseShift_cache, but only when the
     * caller passes a non-empty $cacheKey. The caller is responsible for using
     * a key that is unique for the array content.
     *
     * @param mixed $str Input string/array
     * @param boolean $flag Case-sensitive flag: if false, the value(s) are uppercased
     * @param string $cacheKey Key string used for internal caching of the results. Could be an MD5 hash of the serialized version of the input $str if that is an array.
     * @return mixed Output string/array, processed
     * @access private
     */
    function caseShift($str, $flag, $cacheKey = '') {
        if (!is_array($str)) {
            return $flag ? $str : strtoupper($str);
        }

        // Without a caller-supplied key nothing may be cached: appending the
        // flag would otherwise yield the shared key "1" for every such call.
        if (!strlen($cacheKey)) {
            return $flag ? $str : array_map('strtoupper', $str);
        }

        $cacheKey .= $flag ? 1 : 0;
        if (!isset($this->caseShift_cache[$cacheKey])) {
            $this->caseShift_cache[$cacheKey] = $flag ? $str : array_map('strtoupper', $str);
        }
        return $this->caseShift_cache[$cacheKey];
    }

    /**
     * Compiling an array with tag attributes into a string
     *
     * An attribute gets a '=value' part if its value is not an empty string or
     * if $meta holds a 'dashType' entry for it; otherwise only the name is output.
     *
     * @param array $tagAttrib Tag attributes
     * @param array $meta Meta information about these attributes (like if they were quoted): 'origTag' (original attribute name) and 'dashType' (quote character)
     * @param boolean $xhtmlClean If set, then the attribute names will be set in lower case, value quotes in double-quotes and the value will be htmlspecialchar()'ed
     * @return string Imploded attributes, eg: 'attribute="value" attrib2="value2"'
     * @access private
     */
    function compileTagAttribs($tagAttrib, $meta = array(), $xhtmlClean = 0) {
        $accu = array();
        if (!is_array($tagAttrib)) {
            return '';
        }

        foreach ($tagAttrib as $k => $v) {
            $hasValuePart = strcmp($v, '') || isset($meta[$k]['dashType']);
            if ($xhtmlClean) {
                $attr = strtolower($k);
                if ($hasValuePart) {
                    $attr .= '="' . htmlspecialchars($v) . '"';
                }
            } else {
                $attr = !empty($meta[$k]['origTag']) ? $meta[$k]['origTag'] : $k;
                if ($hasValuePart) {
                    if (!empty($meta[$k]['dashType'])) {
                        // Re-use the quote character recorded in $meta
                        $dash = $meta[$k]['dashType'];
                    } else {
                        // No quote recorded: integers stay unquoted, all else gets double quotes
                        $dash = t3lib_div::testInt($v) ? '' : '"';
                    }
                    $attr .= '=' . $dash . $v . $dash;
                }
            }
            $accu[] = $attr;
        }
        return implode(' ', $accu);
    }

    /**
     * Get tag attributes, the classic version (which had some limitations?)
     * Returns only the first element of the get_tag_attributes() result.
     *
     * @param string $tag The tag
     * @param boolean $deHSC De-htmlspecialchar flag.
     * @return array Attribute array; empty array if get_tag_attributes() delivered none
     * @access private
     */
    function get_tag_attributes_classic($tag, $deHSC = 0) {
        $attr = $this->get_tag_attributes($tag, $deHSC);
        return is_array($attr[0]) ? $attr[0] : array();
    }

    /**
     * Indents input content with $number instances of $indentChar
     *
     * @param string Content string, multiple lines.
     * @param integer Number of indents
     * @param string Indent character/string
     * @return string Indented code (typ.
HTML)
     */
    function indentLines($content, $number = 1, $indentChar = TAB) {
        // Build the indentation once: $number repetitions of $indentChar
        $preTab = str_pad('', $number * strlen($indentChar), $indentChar);
        // CR characters are dropped, so the result uses LF line endings only
        $lines = explode(LF, str_replace(CR, '', $content));
        foreach ($lines as $k => $v) {
            $lines[$k] = $preTab . $v;
        }
        return implode(LF, $lines);
    }

    /**
     * Converts TSconfig into an array for the HTMLcleaner function.
     *
     * Properties read from $TSconfig: allowTags, tags. (incl. tags.[tag].fixAttrib.),
     * localNesting, globalNesting, rmTagIfNoAttrib, noAttrib, removeTags,
     * xhtml_cleaning, keepNonMatchedTags and htmlSpecialChars.
     *
     * @param array $TSconfig TSconfig for HTMLcleaner
     * @param array $keepTags Array of tags to keep (?); merged on top of the tags from "allowTags"
     * @return array Numeric array: [0] tag configuration array, [1] "keepNonMatchedTags" as string, [2] "htmlSpecialChars" as integer, [3] additional configuration (array with 'xhtml' => 1 if "xhtml_cleaning" is set)
     * @access private
     */
    function HTMLparserConfig($TSconfig, $keepTags = array()) {
        // Allow tags (base list, merged with incoming array)
        $alTags = array_flip(t3lib_div::trimExplode(',', strtolower($TSconfig['allowTags']), 1));
        $keepTags = array_merge($alTags, $keepTags);

        // Set config properties.
        if (is_array($TSconfig['tags.'])) {
            // First pass: scalar entries switch a tag off ("0") or on ("1").
            foreach ($TSconfig['tags.'] as $key => $tagC) {
                if (!is_array($tagC) && $key == strtolower($key)) {
                    if (!strcmp($tagC, '0')) {
                        unset($keepTags[$key]);
                    }
                    if (!strcmp($tagC, '1') && !isset($keepTags[$key])) {
                        $keepTags[$key] = 1;
                    }
                }
            }

            // Second pass: array entries ("tag.") carry the detailed tag configuration.
            foreach ($TSconfig['tags.'] as $key => $tagC) {
                if (is_array($tagC) && $key == strtolower($key)) {
                    // Strip the trailing dot of the TypoScript key
                    $key = substr($key, 0, -1);
                    if (!is_array($keepTags[$key])) {
                        $keepTags[$key] = array();
                    }

                    if (is_array($tagC['fixAttrib.'])) {
                        foreach ($tagC['fixAttrib.'] as $atName => $atConfig) {
                            if (!is_array($atConfig)) {
                                continue;
                            }
                            $atName = substr($atName, 0, -1);

                            $existing = array();
                            if (isset($keepTags[$key]['fixAttrib'][$atName]) && is_array($keepTags[$key]['fixAttrib'][$atName])) {
                                $existing = $keepTags[$key]['fixAttrib'][$atName];
                            }
                            // Candidate for t3lib_div::array_merge() if integer-keys will some day make trouble...
                            $merged = array_merge($existing, $atConfig);

                            // "range" and "list" are comma lists in TSconfig; explode them
                            // unless they are empty or already came in as arrays.
                            foreach (array('range', 'list') as $listOption) {
                                if (isset($merged[$listOption]) && !is_array($merged[$listOption]) && strcmp($merged[$listOption], '')) {
                                    $merged[$listOption] = t3lib_div::trimExplode(',', $merged[$listOption]);
                                }
                            }
                            $keepTags[$key]['fixAttrib'][$atName] = $merged;
                        }
                    }

                    unset($tagC['fixAttrib.']);
                    unset($tagC['fixAttrib']);
                    // Candidate for t3lib_div::array_merge() if integer-keys will some day make trouble...
                    $keepTags[$key] = array_merge($keepTags[$key], $tagC);
                }
            }
        }

        // Comma-list options that set one property on every listed tag which is
        // already present in $keepTags. Order matters: globalNesting is applied
        // after localNesting and overrules it for tags listed in both.
        $tagListOptions = array(
            array('localNesting', 'nesting', 1),
            array('globalNesting', 'nesting', 'global'),
            array('rmTagIfNoAttrib', 'rmTagIfNoAttrib', 1),
            array('noAttrib', 'allowedAttribs', 0),
        );
        foreach ($tagListOptions as $tagListOption) {
            list($tsKey, $property, $propertyValue) = $tagListOption;
            if (empty($TSconfig[$tsKey])) {
                continue;
            }
            $lN = t3lib_div::trimExplode(',', strtolower($TSconfig[$tsKey]), 1);
            foreach ($lN as $tn) {
                if (isset($keepTags[$tn])) {
                    // Tags enabled through "allowTags" or "tags.x = 1" hold a scalar;
                    // turn it into an array before a property is stored on it.
                    if (!is_array($keepTags[$tn])) {
                        $keepTags[$tn] = array();
                    }
                    $keepTags[$tn][$property] = $propertyValue;
                }
            }
        }

        // removeTags: (re)defines the tag with no attributes allowed and
        // removal of the tag when it has no attributes.
        if ($TSconfig['removeTags']) {
            $lN = t3lib_div::trimExplode(',', strtolower($TSconfig['removeTags']), 1);
            foreach ($lN as $tn) {
                $keepTags[$tn] = array(
                    'allowedAttribs' => 0,
                    'rmTagIfNoAttrib' => 1,
                );
            }
        }

        // Create additional configuration:
        $addConfig = array();
        if ($TSconfig['xhtml_cleaning']) {
            $addConfig['xhtml'] = 1;
        }

        return array(
            $keepTags,
            '' . $TSconfig['keepNonMatchedTags'],
            intval($TSconfig['htmlSpecialChars']),
            $addConfig
        );
    }

    /**
     * Tries to convert the content to be XHTML compliant and other stuff like that.
     * STILL EXPERIMENTAL. See comments below.
     *
     * What it does NOT do (yet) according to XHTML specs.:
     * - Wellformedness: Nesting is NOT checked
     * - name/id attribute issue is not observed at this point.
     * - Certain nesting of elements not allowed. Most interesting, <PRE> cannot contain img, big,small,sub,sup ...
     * - Wrapping scripts and style element contents in CDATA - or alternatively they should have entities converted.
     * - Setting charsets may put some special requirements on both XML declaration/ meta-http-equiv. (C.9)
     * - UTF-8 encoding is in fact expected by XML!!
     * - stylesheet element and attribute names are NOT converted to lowercase
     * - ampersands (and entities in general I think) MUST be converted to an entity reference! (&s;). This may mean further conversion of non-tag content before output to page. May be related to the charset issue as a whole.
     * - Minimized values not allowed: Must do this: selected="selected"
     *
     * What it does at this point:
     * - All tags (frame,base,meta,link + img,br,hr,area,input) is ended with "/>" - others?
     * - Lowercase for elements and attributes
     * - All attributes in quotes
     * - Add "alt" attribute to img-tags if it's not there already.
     *
     * @param string $content Content to clean up
     * @return string Cleaned up content returned.
     * @access private
     */
    function XHTML_clean($content) {
        $content = $this->HTMLcleaner(
            $content,
            array(), // No tags treated specially
            1, // Keep ALL tags.
            0, // All content is htmlspecialchar()'ed (or ??) - if we do, <script> content will break...
            array('xhtml' => 1)
        );
        return $content;
    }

    /**
     * Processing all tags themselves
     * (Some additions by Sacha Vorbeck)
     *
     * @param string Tag to process
     * @param array Configuration array passing instructions for processing. If count()==0, function will return value unprocessed.
See source code for details
     * @param boolean Is endtag, then set this.
     * @param boolean If set, just return value straight away
     * @return string Processed value.
     * @access private
     */
    function processTag($value, $conf, $endTag, $protected = 0) {
        // Return immediately if protected or no parameters
        if ($protected || !count($conf)) {
            return $value;
        }

        // OK then, begin processing for XHTML output:
        // STILL VERY EXPERIMENTAL!!
        if (!empty($conf['xhtml'])) {
            if ($endTag) {
                // Endtags are just set lowercase right away
                $value = strtolower($value);
            } elseif (substr($value, 0, 4) != '<!--') { // ... and comments are ignored.
                // Finding inner value with out < > (or < />)
                $inValue = substr($value, 1, (substr($value, -2) == '/>' ? -2 : -1));
                // Separate attributes and tagname. A tag without attributes yields
                // only one part, so pad the result to always have both elements.
                list($tagName, $tagP) = array_pad(preg_split('/\s+/s', $inValue, 2), 2, '');
                $tagName = strtolower($tagName);

                // Process attributes
                $tagAttrib = $this->get_tag_attributes($tagP);
                // Set alt attribute for all images (not XHTML though...)
                if (!strcmp($tagName, 'img') && !isset($tagAttrib[0]['alt'])) {
                    $tagAttrib[0]['alt'] = '';
                }
                // Set type attribute for all script-tags
                if (!strcmp($tagName, 'script') && !isset($tagAttrib[0]['type'])) {
                    $tagAttrib[0]['type'] = 'text/javascript';
                }

                $outA = array();
                foreach ($tagAttrib[0] as $attrib_name => $attrib_value) {
                    // Set attributes: always in quotes, with htmlspecialchars converted.
                    // The names are used as delivered by get_tag_attributes().
                    $outA[] = $attrib_name . '="' . $this->bidir_htmlspecialchars($attrib_value, 2) . '"';
                }
                $newTag = '<' . trim($tagName . ' ' . implode(' ', $outA));

                // All tags that are standalone (not wrapping, not having endtags) should be ended with '/>'
                if (t3lib_div::inList('img,br,hr,meta,link,base,area,input,param,col', $tagName) || substr($value, -2) == '/>') {
                    $newTag .= ' />';
                } else {
                    $newTag .= '>';
                }
                $value = $newTag;
            }
        }

        return $value;
    }

    /**
     * Processing content between tags for HTML_cleaner
     *
     * @param string The value
     * @param integer Direction, either -1 or +1. 0 (zero) means no change to input value.
* @param mixed Not used by this method.
     * @return string The value, passed through bidir_htmlspecialchars() unless the direction is 0.
     * @access private
     */
    function processContent($value, $dir, $conf) {
        // Direction 0: hand the value back untouched.
        if ($dir == 0) {
            return $value;
        }

        return $this->bidir_htmlspecialchars($value, $dir);
    }
}

// Load the XCLASS extension file for this class, if one is registered for the current TYPO3_MODE.
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_parsehtml.php'])) {
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_parsehtml.php']);
    }
}

?>