0byt3m1n1

Path: /data/applications/aps.bak/lifetype/1.2.11-2/standard/htdocs/class/data/utf8/ [ Home ]
File: utf8_funcs.php
<?php

/**
 * UTF-8 related helper functions
 * @author CB
 * @email cb.utblog@gmail.com
 * @homepage http://www.utblog.com/plog/CB
 * @date 25 Jul 2005
 */

/**
 * int utf8_isValidChar(string $inputStr, int $start = 0)
 * Tell whether a structurally well-formed UTF-8 byte sequence begins at byte offset $start.
 *
 * Only the lead-byte range and the 10xxxxxx pattern of the continuation bytes are checked.
 * Overlong encodings, surrogate code points and the obsolete 5- and 6-byte forms are
 * therefore all accepted.
 *
 * @param $inputStr input string (treated as raw bytes)
 * @param $start byte offset at which the sequence is expected to begin
 * @return the number of bytes (1 to 6) occupied by the sequence if it is well formed,
 *         0 if the string is empty or $start lies outside of it,
 *         -1 if the bytes at $start do not form a well-formed sequence
 * @note original author's note: do not pass $inputStr by reference, it slowed things down significantly
 * @note the input is a string rather than a char array for performance reasons;
 *       preg_split consumed too much memory and cpu when splitting a big string into chars
 */
function utf8_isValidChar($inputStr, $start = 0)
{
	$size = strlen($inputStr);
	if($size <= 0 || $start < 0 || $size <= $start) return 0;

	// Square-bracket offsets are used on purpose: the curly-brace form ($str{$i})
	// was deprecated in PHP 7.4 and is a parse error since PHP 8.0.
	$inOrd = ord($inputStr[$start]);
	$us = 0;
	if($inOrd <= 0x7F) { //0xxxxxxx
		return 1;
	} else if($inOrd >= 0xC0 && $inOrd <= 0xDF ) { //110xxxxx 10xxxxxx
		$us = 2;
	} else if($inOrd >= 0xE0 && $inOrd <= 0xEF ) { //1110xxxx 10xxxxxx 10xxxxxx
		$us = 3;
	} else if($inOrd >= 0xF0 && $inOrd <= 0xF7 ) { //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		$us = 4;
	} else if($inOrd >= 0xF8 && $inOrd <= 0xFB ) { //111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		$us = 5;
	} else if($inOrd >= 0xFC && $inOrd <= 0xFD ) { //1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		$us = 6;
	} else
		//stray continuation byte (0x80-0xBF) or 0xFE/0xFF
		return -1;

	//sequence is truncated by the end of the string
	if($size - $start < $us)
		return -1;

	//every trailing byte must look like 10xxxxxx
	for($i = 1; $i < $us; $i++)
	{
		$od = ord($inputStr[$start + $i]);
		if($od < 0x80 || $od > 0xBF)
			return -1;
	}
	return $us;
}

/**
 * mixed utf8_substr(string $inputStr, int $start, int $length = -1, bool $ignore_error = true)
 * Extract a substring, counting in UTF-8 characters instead of bytes.
 *
 * @param $inputStr input string
 * @param $start index of the first character to return, must be greater than or equal to 0
 * @param $length number of characters to return. if $length < 0, everything from $start to the
 *        end of the string is returned as-is (invalid bytes in that tail are NOT replaced)
 * @param $ignore_error whether to tolerate invalid bytes. if true (the default) every invalid byte
 *        counts as one character and, inside the returned part, is replaced with '?'.
 *        if false, the first invalid byte encountered makes the function return false
 * @return the substring, or false if $start < 0, $length == 0, $start lies beyond the end of
 *         the string, or an invalid byte is met while $ignore_error is false
 */
function utf8_substr($inputStr, $start, $length = -1, $ignore_error = true)
{
	if($start < 0 || $length == 0)
		return false;

	//step 1: move the byte cursor past the first $start characters
	$bytePos = 0;
	for($skipped = 0; $skipped < $start; $skipped++)
	{
		$hm = utf8_isValidChar($inputStr, $bytePos);
		if($hm == 0)
		{
			//$start is bigger than the utf8 length of the input string
			return false;
		}
		if($hm == -1)
		{
			if(!$ignore_error)
				return false;
			//an invalid byte is skipped as a single character
			$hm = 1;
		}
		$bytePos += $hm;
	}

	if($length < 0)
		return substr($inputStr, $bytePos);

	//step 2: copy up to $length characters.
	//substr() replaces the former byte-by-byte copy, which relied on the
	//$str{$i} offset syntax that was removed in PHP 8.0.
	$result = '';
	for($taken = 0; $taken < $length; $taken++)
	{
		$hm = utf8_isValidChar($inputStr, $bytePos);
		if($hm == 0)
		{
			//end of string reached before $length characters were collected
			break;
		}
		if($hm == -1)
		{
			if(!$ignore_error)
				return false;
			$result .= '?';
			$bytePos++;
		}
		else
		{
			$result .= substr($inputStr, $bytePos, $hm);
			$bytePos += $hm;
		}
	}
	return $result;
}

/**
 * int utf8_strlen(string $inputStr, bool $ignore_error = true)
 * Count the UTF-8 characters of a string.
 *
 * @param $inputStr input string
 * @param $ignore_error if true (the default), every invalid byte is counted as ONE character.
 *        if false, the first invalid byte makes the function return -1
 * @return the number of utf8 characters, or -1 if $ignore_error is false and
 *         the input string contains an invalid byte
 */
function utf8_strlen($inputStr, $ignore_error = true)
{
	$count = 0;
	$offset = 0;
	for(;;)
	{
		$width = utf8_isValidChar($inputStr, $offset);
		if($width == 0)
		{
			//offset ran past the last byte: nothing left to count
			break;
		}
		if($width == -1)
		{
			if(!$ignore_error)
				return -1;
			//step over the invalid byte, counting it as one character
			$width = 1;
		}
		$offset += $width;
		$count++;
	}
	return $count;
}

/**
 * int utf8_proportion(string $inputStr)
 * Measure how much of a string consists of well-formed UTF-8 sequences.
 *
 * @param $inputStr input string
 * @return percentage (0 to 100, truncated to an integer) of the bytes of $inputStr that belong to
 *         sequences accepted by utf8_isValidChar. an empty string yields 100
 * @see http://www.utblog.com/plog/1/article/292
 */
function utf8_proportion($inputStr)
{
	$totalBytes = strlen($inputStr);
	if($totalBytes == 0)
		return 100;

	$goodBytes = 0;
	$offset = 0;
	for(;;)
	{
		$width = utf8_isValidChar($inputStr, $offset);
		if($width == 0)
			break;

		if($width == -1)
		{
			//invalid byte: skip it without crediting it
			$offset++;
		}
		else
		{
			$goodBytes += $width;
			$offset += $width;
		}
	}

	return ($goodBytes == $totalBytes) ? 100 : (int)($goodBytes*100.0/$totalBytes);
}

?>