0byt3m1n1

Path: /data/applications/aps/typo3/12.4.7/standard/htdocs/typo3/sysext/core/Classes/Html/ [ Home ]
File: HtmlCropper.php
<?php

declare(strict_types=1);

/*
 * This file is part of the TYPO3 CMS project.
 *
 * It is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License, either version 2
 * of the License, or any later version.
 *
 * For the full copyright and license information, please read the
 * LICENSE.txt file that was distributed with this source code.
 *
 * The TYPO3 project - inspiring people to share!
 */

namespace TYPO3\CMS\Core\Html;

use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareTrait;

/**
 * Crops HTML markup to a number of visible characters. Characters inside tags are not counted,
 * HTML entities count as a single character and tags left open by the crop are closed again.
 */
class HtmlCropper implements LoggerAwareInterface
{
    use LoggerAwareTrait;

    /**
     * Implements "cropHTML" which is a modified "substr" function allowing to limit a string length to a certain number
     * of chars (from either start or end of string) and having a pre/postfix applied if the string really was cropped.
     *
     * Only characters outside of tags are counted. If nothing has to be cropped, the content is returned unchanged
     * and the ellipsis replacement is not added.
     *
     * @param string $content The string to perform the operation on
     * @param int $numberOfChars Max number of chars of the string. Negative value means cropping from end of string.
     * @param string $replacementForEllipsis  The pre/postfix string to apply if cropping occurs (it is trimmed first).
     * @param bool $cropToSpace If true then crop will be applied at nearest space.
     * @return string The processed input value.
     */
    public function crop(string $content, int $numberOfChars, string $replacementForEllipsis, bool $cropToSpace): string
    {
        $cropFromRight = $numberOfChars < 0;
        $absChars = abs($numberOfChars);
        $replacementForEllipsis = trim($replacementForEllipsis);
        // Split $content into an array(even items in the array are outside the tags, odd numbers are tag-blocks).
        $tags = 'a|abbr|address|area|article|aside|audio|b|bdi|bdo|blockquote|body|br|button|caption|cite|code|col|colgroup|data|datalist|dd|del|dfn|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|h1|h2|h3|h4|h5|h6|header|hr|i|iframe|img|input|ins|kbd|keygen|label|legend|li|link|main|map|mark|meter|nav|object|ol|optgroup|option|output|p|param|pre|progress|q|rb|rp|rt|rtc|ruby|s|samp|section|select|small|source|span|strong|sub|sup|table|tbody|td|textarea|tfoot|th|thead|time|tr|track|u|ul|ut|var|video|wbr';
        $tagsRegEx = '
			(
				(?:
					<!--.*?-->					# a comment
					|
					<canvas[^>]*>.*?</canvas>   # a canvas tag
					|
					<script[^>]*>.*?</script>   # a script tag
					|
					<noscript[^>]*>.*?</noscript> # a noscript tag
					|
					<template[^>]*>.*?</template> # a template tag
				)
				|
				</?(?:' . $tags . ')+			# opening tag (\'<tag\') or closing tag (\'</tag\')
				(?:
					(?:
						(?:
							\\s+\\w[\\w-]*		# EITHER spaces, followed by attribute names
							(?:
								\\s*=?\\s*		# equals
								(?>
									".*?"		# attribute values in double-quotes
									|
									\'.*?\'		# attribute values in single-quotes
									|
									[^\'">\\s]+	# plain attribute values
								)
							)?
						)
						|						# OR a single dash (for TYPO3 link tag)
						(?:
							\\s+-
						)
					)+\\s*
					|							# OR only spaces
					\\s*
				)
				/?>								# closing the tag with \'>\' or \'/>\'
			)';
        $splittedContent = preg_split('%' . $tagsRegEx . '%xs', $content, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($splittedContent === false) {
            // The logger is injected from the outside and may be missing; never fail just because of that.
            if ($this->logger !== null) {
                $this->logger->debug('Unable to split "{content}" into tags.', ['content' => $content]);
            }
            $splittedContent = [];
        }

        // Reverse array if we are cropping from right, so that the loops below can always walk "forward".
        if ($cropFromRight) {
            $splittedContent = array_reverse($splittedContent);
        }
        // Crop the text (chars of tag-blocks are not counted).
        $strLen = 0;
        // This is the offset of the content item which was cropped.
        $croppedOffset = null;
        $countSplittedContent = count($splittedContent);
        // Text items live on the even offsets, tag-blocks on the odd ones.
        for ($offset = 0; $offset < $countSplittedContent; $offset += 2) {
            $fullTempContent = $splittedContent[$offset];
            $thisStrLen = mb_strlen(html_entity_decode($fullTempContent, ENT_COMPAT, 'UTF-8'), 'utf-8');
            if ($strLen + $thisStrLen <= $absChars) {
                // The whole text item still fits into the budget.
                $strLen += $thisStrLen;
                continue;
            }

            $croppedOffset = $offset;
            // Number of visible characters which may still be taken from this item,
            // translated into raw characters (an entity is one visible but several raw characters).
            $cropPosition = $this->resolveEntityAwareCropPosition($fullTempContent, $absChars - $strLen, $cropFromRight);

            // Main cropping. Note the +1 and -1. These are there to be able to
            // check for space characters later on.
            $fullTempContent = $cropFromRight
                ? mb_substr($fullTempContent, -$cropPosition - 1)
                : mb_substr($fullTempContent, 0, $cropPosition + 1);

            // Crop to space means, we ensure to crop before (or after) a space.
            // If there are no spaces, this option has no effect.
            $cropToSpaceApplied = false;
            if ($cropToSpace) {
                $words = explode(' ', $fullTempContent);
                if (count($words) > 1) {
                    // Drop the (possibly incomplete) word at the cropped edge.
                    if ($cropFromRight) {
                        array_shift($words);
                    } else {
                        array_pop($words);
                    }
                    $cropToSpaceApplied = true;
                }
                $fullTempContent = implode(' ', $words);
            }

            // Only remove the extra character again, if crop2space did not apply anything.
            if (!$cropToSpaceApplied) {
                $fullTempContent = $cropFromRight ? mb_substr($fullTempContent, 1) : mb_substr($fullTempContent, 0, -1);
            }

            $splittedContent[$offset] = $fullTempContent;
            break;
        }

        // Close cropped tags.
        $closingTags = [];
        if ($croppedOffset !== null) {
            $openingTagRegEx = '#^<(\\w+)(?:\\s|>)#';
            $closingTagRegEx = '#^</(\\w+)(?:\\s|>)#';
            // The array is reversed when cropping from the right. In that case the tags in front of the
            // cropped item are closing tags and their counterparts are the opening tags behind it.
            $keptTagRegEx = $cropFromRight ? $closingTagRegEx : $openingTagRegEx;
            $counterpartTagRegEx = $cropFromRight ? $openingTagRegEx : $closingTagRegEx;
            $countSplittedContent = count($splittedContent);
            for ($offset = $croppedOffset - 1; $offset >= 0; $offset -= 2) {
                if (substr($splittedContent[$offset], -2) === '/>') {
                    // Ignore empty element tags (e.g. <br />).
                    continue;
                }
                preg_match($keptTagRegEx, $splittedContent[$offset], $matches);
                $tagName = $matches[1] ?? null;
                if ($tagName === null) {
                    continue;
                }
                // Seek for the closing (or opening) tag.
                for ($seekingOffset = $offset + 2; $seekingOffset < $countSplittedContent; $seekingOffset += 2) {
                    preg_match($counterpartTagRegEx, $splittedContent[$seekingOffset], $matches);
                    $seekingTagName = $matches[1] ?? null;
                    if ($tagName === $seekingTagName) {
                        // We found a matching tag.
                        // Add closing tag only if it occurs after the cropped content item.
                        if ($seekingOffset > $croppedOffset) {
                            $closingTags[] = $splittedContent[$seekingOffset];
                        }
                        break;
                    }
                }
            }
            // Drop the cropped items of the content array. The $closingTags will be added later on again.
            array_splice($splittedContent, $croppedOffset + 1);
        }
        $splittedContent = array_merge($splittedContent, [
            $croppedOffset !== null ? $replacementForEllipsis : '',
        ], $closingTags);
        // Reverse array once again if we are cropping from the end.
        if ($cropFromRight) {
            $splittedContent = array_reverse($splittedContent);
        }
        return implode('', $splittedContent);
    }

    /**
     * Translates a crop length given in visible characters into the number of raw characters of $text
     * that have to be kept, counting every HTML entity (e.g. "&amp;") as one visible character.
     *
     * @param string $text The text item (without tags) which is about to be cropped
     * @param int $cropPosition Number of visible characters to keep from $text
     * @param bool $cropFromRight If true, the characters are kept from the end of $text instead of the start
     * @return int Number of raw characters to keep from the start (or end) of $text
     */
    private function resolveEntityAwareCropPosition(string $text, int $cropPosition, bool $cropFromRight): int
    {
        // The snippet "&[^&\s;]{2,8};" in the RegEx below represents entities.
        $entityPattern = '/&[^&\\s;]{2,8};/';
        // Strategy is to replace all found entities with an arbitrary character ($) and use this new
        // string to count offsets. Literal dollar signs are neutralised first, so that every "$" found
        // afterwards stands for exactly one entity. Entities are matched on the very same string the
        // placeholders are created from, which keeps both lists aligned.
        $escapedText = str_replace('$', ' ', $text);
        $entityCount = preg_match_all($entityPattern, $escapedText, $matches);
        if ($entityCount === false || $entityCount === 0) {
            return $cropPosition;
        }
        $placeholderText = preg_replace($entityPattern, '$', $escapedText);
        if ($placeholderText === null) {
            return $cropPosition;
        }

        // The window holds exactly the visible characters that survive the crop. When cropping from the
        // right these are the LAST $cropPosition characters of this item, no matter how much of the
        // overall budget was already used up by previous items.
        $window = $cropFromRight
            ? mb_substr($placeholderText, -$cropPosition, $cropPosition)
            : mb_substr($placeholderText, 0, $cropPosition);

        $rawLength = mb_strlen($window);
        $placeholderCount = substr_count($window, '$');
        if ($placeholderCount === 0) {
            return $rawLength;
        }

        // Placeholders inside the window belong to the first (or, when cropping from the right, the last)
        // entities of the text. Each of them contributes its full raw length instead of one character.
        $windowEntities = $cropFromRight
            ? array_slice($matches[0], -$placeholderCount)
            : array_slice($matches[0], 0, $placeholderCount);
        $rawLength -= $placeholderCount;
        foreach ($windowEntities as $entity) {
            $rawLength += mb_strlen($entity);
        }
        return $rawLength;
    }
}